diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bd4b7f0a..f7e7239f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,8 +35,13 @@ jobs:
- name: Style and static checks
run: mvn -B -ntp spotless:check checkstyle:check
+ - name: Build Rust runtime
+ run: cargo build --manifest-path runtime/doctruth-runtime/Cargo.toml --bins
+
- name: Verify (unit + integration + recorded LLM + coverage)
run: mvn -B -ntp verify -P recorded
+ env:
+ DOCTRUTH_RUNTIME_COMMAND: ${{ github.workspace }}/runtime/doctruth-runtime/target/debug/doctruth-runtime
- name: Resolve project version
run: echo "PROJECT_VERSION=$(mvn -q -DforceStdout help:evaluate -Dexpression=project.version)" >> "$GITHUB_ENV"
@@ -50,6 +55,12 @@ jobs:
- name: Smoke CLI release tarball
run: scripts/smoke-cli-release.sh --version "${PROJECT_VERSION}"
+ - name: Smoke parser accuracy seed corpus
+ run: scripts/smoke-doctruth-parser-accuracy-seed-corpus.sh
+
+ - name: Smoke real model suite skip path
+ run: scripts/smoke-doctruth-real-model-suite.sh
+
- name: Generate SBOM
run: mvn -B -ntp -DskipTests cyclonedx:makeAggregateBom
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 6031ad01..8c465994 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -34,8 +34,31 @@ jobs:
gpg-private-key: ${{ secrets.OSSRH_GPG_PRIVATE_KEY }}
gpg-passphrase: MAVEN_GPG_PASSPHRASE
+ - name: Set up Python 3.10 for real model smoke
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+ cache: pip
+
+ - name: Install real model smoke runtime dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y poppler-utils
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install \
+ 'onnxruntime==1.26.0' \
+ 'pillow>=12,<13' \
+ 'numpy<2.4' \
+ 'paddleocr==3.7.0' \
+ 'paddlepaddle==3.3.1'
+
+ - name: Build Rust runtime
+ run: cargo build --manifest-path runtime/doctruth-runtime/Cargo.toml --bins
+
- name: Verify release commit
run: mvn -B -ntp spotless:check checkstyle:check verify -P recorded
+ env:
+ DOCTRUTH_RUNTIME_COMMAND: ${{ github.workspace }}/runtime/doctruth-runtime/target/debug/doctruth-runtime
- name: Package CLI release artifacts
run: scripts/package-cli-release.sh --version "${GITHUB_REF_NAME#v}"
@@ -43,6 +66,12 @@ jobs:
- name: Smoke CLI release tarball
run: scripts/smoke-cli-release.sh --version "${GITHUB_REF_NAME#v}"
+ - name: Smoke real model suite
+ run: scripts/smoke-doctruth-real-model-suite.sh
+ env:
+ DOCTRUTH_REAL_MODEL_SUITE: '1'
+ DOCTRUTH_SLANEXT_PYTHON: ${{ env.pythonLocation }}/bin/python
+
- name: Generate CycloneDX SBOM
run: |
mvn -B -ntp -DskipTests cyclonedx:makeAggregateBom
diff --git a/.gitignore b/.gitignore
index f4c9fa61..91b13a48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
# Build output
target/
+__pycache__/
*.class
*.jar
*.war
@@ -54,6 +55,8 @@ docs/strategy/
# Test artifacts
**/test-output/
**/recordings/*.tmp.json
+third_party/opendataloader-bench/prediction/doctruth-runtime*/
+third_party/opendataloader-bench/prediction/doctruth-java-core-*/
# Real-world fixture corpus — never check in (may contain customer/PII data)
fixtures/
@@ -65,3 +68,6 @@ dist/
# Local Claude skill state (per-developer)
.claude/
+
+# Local git worktrees
+.worktrees/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 00000000..84e5dad5
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,267 @@
+# DocTruth Agent Guide
+
+DocTruth is the document evidence engine in the doctruthhq stack. It turns
+documents into structured fields, exact source quotes, page/line/bbox citations,
+provenance, parser warnings, audit JSON, and `TrustDocument` output.
+
+## Runtime Architecture
+
+DocTruth's current parser-quality core is Java/PDFBox with
+OpenDataLoader-compatible processors. This is the quality source of truth until
+OpenDataLoader benchmark parity is reached and a separate Rust-core ADR is
+accepted.
+
+```text
+Java SDK / CLI / API
+ -> Java/OpenDataLoader-compatible parser core
+ -> TrustDocument
+ -> Rust runtime shell for corpus/model/process orchestration
+ -> evidence-native TrustDocument
+```
+
+Java/OpenDataLoader-compatible parser core is the current quality source of
+truth for:
+
+```text
+PDF parsing
+PDFBox compatibility
+text extraction
+layout geometry
+reading order
+table heuristics
+heading reconstruction
+parser warnings
+source refs
+TrustDocument normalization
+```
+
+Rust owns the runtime shell and Python replacement boundary:
+
+```text
+warm backend process lifecycle
+benchmark-corpus execution
+OpenDataLoader Bench prediction packaging
+resource accounting
+model/cache verification
+MNN model worker protocol
+Python/Torch/Docling replacement
+fail-closed model routing
+```
+
+`runtime/doctruth-runtime` is therefore the authoritative home for the local
+runtime shell, model-worker boundary, benchmark runner, resource reports, and
+future Rust parser modules. It is not allowed to silently replace the Java
+quality core until benchmark parity proves that replacement.
+
+`pdf_oxide` remains a useful Rust PDF substrate candidate and future parser
+module, but it is not the current default parser-quality source of truth for
+OpenDataLoader parity work.
+
+Java remains the stable enterprise-facing SDK, CLI, API, packaging, lifecycle,
+and current parser-quality backend. Java/PDFBox is not legacy-only in the
+current OpenDataLoader parity plan.
+
+Do not add new parser-quality, OCR/table/layout, model-execution,
+benchmark-corpus, audit-grade parser, or evidence-reconciliation behavior only
+to Rust when the Java/OpenDataLoader-compatible backend is the quality source of
+truth. Rust changes are aligned when they expose, package, run, measure, or
+model-augment behavior owned by the Java parser core.
+
+## Resource Gates
+
+Parser/model resource acceptance is profile-based. Do not use one absolute RSS
+number as a universal product gate.
+
+The product-level hard gates are:
+
+```text
+no Python/Torch/Docling production residency
+lazy model startup
+measurable model unload / idle recovery
+materially lower resource use than the measured heavy oracle on the same
+ machine and corpus
+no unexplained regression from a previously accepted named profile
+```
+
+Each accepted parser profile must record:
+
+```text
+profile name
+model manifest and model SHAs
+platform and architecture
+corpus scope
+measurement command
+cold-load RSS
+warm steady RSS
+peak RSS
+idle-after-unload RSS
+cold latency
+warm latency
+```
+
+Absolute RSS numbers are profiling budgets first. They become regression guards
+only after a benchmark report pins the exact profile. For example, if a Mac
+ARM64 `edge-model` profile with a specific MNN manifest measures 451MB warm
+steady RSS, that value belongs to that measured profile. The acceptance rule is
+that future runs must not materially regress from that profile without an
+updated benchmark report and rationale. Do not rewrite that as a global rule
+such as `edge-model steady RSS <= 600MB`, and do not express acceptance as an
+arithmetic shortcut such as `451MB + steady RSS <= 600MB`.
+
+Before that first report exists, use comparative evidence instead of a fixed
+number: no Python/Torch/Docling production residency, lazy model startup,
+measurable unload behavior, and materially lower resource use than the measured
+heavy oracle on the same machine and corpus.
+
+## Product Boundary
+
+DocTruth answers:
+
+```text
+Where did this extracted document field come from?
+```
+
+DocTruth should stay focused on document evidence. Do not expand it into agent
+memory, team workflow, hosted SaaS governance, insurance scoring, a vector
+database wrapper, or a general document chatbot. Commercial hosted governance
+belongs in Infer Cloud. Agent memory and replay ledger behavior belongs in
+Memtruth.
+
+## Public Contracts
+
+Keep these surfaces stable and versioned:
+
+```text
+TrustDocument
+TrustUnit
+TrustPage
+TrustTable
+EvidenceSpan/source-map semantics
+audit JSON
+parser warnings
+benchmark-corpus manifests
+Rust runtime stdin/stdout protocol
+Java SDK/CLI compatibility contracts
+```
+
+When changing runtime-shell behavior, add tests at the Rust runtime boundary
+first. For parser-quality behavior in the current OpenDataLoader parity plan,
+add Java backend tests first, then Rust runtime tests for process lifecycle,
+packaging, resource accounting, model-worker routing, and benchmark output.
+
+## Parser Reference Boundaries
+
+DocTruth can learn from strong parser projects, but they must not create
+competing canonical outputs:
+
+```text
+pdf_oxide Rust PDF substrate
+Kreuzberg Rust runtime/model/cache/worker architecture reference
+Docling unified document model and lossy export reference
+MinerU layered markdown/content-list/middle/debug output reference
+OpenDataLoader Apache-2.0 geometry, XY-Cut++, content filters, table rules
+DocTruth TrustDocument, citations, audit gates, source maps, replay
+```
+
+`TrustDocument` is the canonical contract. External parser outputs, Markdown,
+OpenDataLoader JSON, Docling-style JSON, MinerU-style `middle.json`, and model
+worker responses are observations that must be normalized into DocTruth-owned
+contracts before they can be audit-grade.
+
+Kreuzberg implementation code must not be copied because its code license is
+not compatible with DocTruth's OSS direction. OpenDataLoader PDF v2+
+Apache-2.0 implementation ideas may be ported only with attribution, source
+commit notes, and NOTICE updates. Prefer Java parser-core ports for parser
+quality first, with Rust ports added only after benchmark evidence supports
+them.
+
+OpenDataLoader Bench is vendored under
+`third_party/opendataloader-bench/` at the source commit recorded in its
+`SOURCE.md`. Treat it as the default external parser-quality benchmark
+foundation, not as a blocker waiting for DocTruth-owned human review. It
+already provides PDFs, ground-truth Markdown, prediction/evaluation artifacts,
+and evaluator code for reading-order, table, heading, and speed metrics.
+
+When parser-quality evidence is needed, first build or update a DocTruth ->
+OpenDataLoader Bench adapter:
+
+```text
+DocTruth Java/OpenDataLoader-compatible parser output
+ -> TrustDocument
+ -> Rust runtime shell packaging
+ -> OpenDataLoader Bench-compatible prediction markdown/artifact
+ -> OpenDataLoader Bench evaluator / evaluation.json
+ -> DocTruth benchmark report external_metrics
+ -> audit-grade parser-quality gate
+```
+
+OpenDataLoader parity is measured, not asserted. A behavior is considered
+ported only when it has a Java parser-core contract test, a Rust contract test
+at the shell boundary when runtime packaging is affected, an upstream source
+reference, and either a focused OpenDataLoader Bench case or a full200 report
+showing the effect. Until full200 reaches the accepted baseline, DocTruth should be
+described as OpenDataLoader-inspired and progressively porting parity, not
+OpenDataLoader-equivalent.
+
+Do not claim parser-quality work is blocked only because DocTruth lacks its own
+human-reviewed corpus. The DocTruth-owned human-reviewed corpus and review
+workstation are follow-up assets for evidence-specific labels. They supplement
+OpenDataLoader Bench; they do not replace it as the first external
+parser-quality gate.
+
+If multiple parser signals disagree, do not hide the conflict. Record parser
+provenance, emit warnings, and block audit-grade status for severe conflicts
+such as uncertain reading order, failed quote anchoring, missing visual bbox,
+or low-confidence table structure.
+
+## Verification
+
+For Java parser-quality changes:
+
+```bash
+mvn test
+mvn verify -P recorded
+git diff --check
+```
+
+For Rust runtime-shell, model-worker, or corpus changes:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml
+sh scripts/smoke-doctruth-runtime.sh
+git diff --check
+```
+
+For Rust model-worker or corpus changes, also run the relevant smoke:
+
+```bash
+sh scripts/smoke-doctruth-runtime-model-worker.sh
+sh scripts/smoke-doctruth-runtime-benchmark-corpus.sh
+```
+
+For Java SDK/CLI compatibility-only changes:
+
+```bash
+mvn test
+mvn verify -P recorded
+git diff --check
+```
+
+Do not claim complete OpenDataLoader parity while parser-quality,
+model/cache, layout/table/OCR, corpus, audit-grade, or evidence-reconciliation
+behavior lacks benchmark evidence. If a Rust parser path exists, it must be
+documented and tested as experimental or secondary until it matches the Java
+quality core on the benchmark gate.
+
+## Contribution Rules
+
+- Use TDD for non-trivial behavior changes.
+- Keep generated artifacts and private fixture corpora out of git.
+- Do not commit secrets, customer documents, API keys, or production-like data.
+- Add ADRs for dependencies that affect runtime, model execution, storage,
+ protocol, security, networking, cryptography, policy, public API shape, or
+ release packaging.
+- Prefer small, reviewable units, but split by responsibility rather than rigid
+ line-count rules.
+- One concept per commit and PR.
diff --git a/NOTICE b/NOTICE
index 03ac33bc..5c5b70dd 100644
--- a/NOTICE
+++ b/NOTICE
@@ -36,6 +36,39 @@ This software has runtime dependencies on the following open-source libraries
- Failsafe (dev.failsafe:failsafe) — Apache License 2.0
- Apache Commons Text (org.apache.commons:*) — Apache License 2.0
+Bundled third-party benchmark material:
+
+- OpenDataLoader Bench
+ Source: https://github.com/opendataloader-project/opendataloader-bench
+ Imported commit: 7af1d8f4d0c09f51ea1a5c6ba5f66e993286d109
+ Location: third_party/opendataloader-bench/
+ License: Apache License 2.0
+ Dataset notice: DP-Bench is listed by OpenDataLoader Bench as MIT in
+ third_party/opendataloader-bench/THIRD_PARTY_NOTICES.md.
+
+Reference implementations adapted in DocTruth-owned code:
+
+- OpenDataLoader PDF parser processors
+ Source: https://github.com/opendataloader-project/opendataloader-pdf
+ Reference commit: d1845179a1286bbb76f9618e8b6c8f51509a52f4
+ Location: third_party/opendataloader-pdf-reference
+ Reference files:
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorter.java
+ java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorterTest.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ContentFilterProcessor.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextProcessor.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextLineProcessor.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ClusterTableProcessor.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableStructureNormalizer.java
+ java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeadingProcessor.java
+ License: Apache License 2.0
+ Adaptation: Java-owned geometry projection reading order in
+ src/main/java/ai/doctruth/PdfGeometryReadingOrderSorter.java plus
+ Rust-owned reading order, content filtering, line grouping, cluster-table
+ fallback, table-structure normalization, and heading behavior in
+ runtime/doctruth-runtime/src/lib.rs; TrustDocument remains the only canonical
+ DocTruth output contract.
+
Test-scope dependencies (not bundled in published artifacts):
- JUnit Jupiter, AssertJ, Mockito, WireMock, slf4j-simple — see their respective
diff --git a/README.md b/README.md
index 81c6d1f0..116e445d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# DocTruth - Auditable LLM Extraction for Java
+# DocTruth - Rust-Core Document Evidence Runtime
@@ -14,9 +14,13 @@
[](https://github.com/doctruthhq/DocTruth/actions)
[](#installation)
[](LICENSE)
-[](https://openjdk.org)
+[](runtime/doctruth-runtime)
+[](https://openjdk.org)
-**Auditable LLM extraction for Java.** DocTruth turns PDFs, DOCX, XLSX, and CSV files into schema-bound structured output with field-level source citations, optional PDF bounding boxes, confidence scores, provenance, and PROV-O audit JSON.
+**DocTruth is a Rust-core document evidence runtime with Java SDK/CLI wrappers.**
+It turns PDFs and other documents into schema-bound structured output with
+field-level source citations, optional PDF bounding boxes, confidence scores,
+provenance, and PROV-O audit JSON.
DocTruth is for teams that need to answer one question reliably:
@@ -24,7 +28,14 @@ DocTruth is for teams that need to answer one question reliably:
The core boundary is simple: source document in, validated structured output plus evidence trail out.
-It is framework-agnostic and fits into plain Java, Spring Boot, LangChain4j, Spring AI, Quarkus, Micronaut, or any Java service that already calls OpenAI, Anthropic, Gemini, DeepSeek, or an OpenAI-compatible model endpoint.
+The parser/runtime core lives in [`runtime/doctruth-runtime`](runtime/doctruth-runtime).
+Java is the integration wrapper: SDK, CLI, API compatibility, packaging, and
+enterprise lifecycle. Java/PDFBox is legacy/oracle only and is not the default
+parser path.
+
+DocTruth is framework-agnostic and fits into plain Java, Spring Boot,
+LangChain4j, Spring AI, Quarkus, Micronaut, or any service that already calls
+OpenAI, Anthropic, Gemini, DeepSeek, or an OpenAI-compatible model endpoint.
```text
contract.pdf
@@ -36,7 +47,15 @@ contract.pdf
## Installation
-Requires Java 25+. Use in a Maven project:
+The main parser path requires the Rust runtime. Release tarballs and the
+installed CLI include `doctruth-runtime` and set `DOCTRUTH_RUNTIME_COMMAND`
+automatically. Direct Maven/JAR usage should set it explicitly:
+
+```bash
+export DOCTRUTH_RUNTIME_COMMAND=/path/to/doctruth-runtime
+```
+
+The Java wrapper requires Java 25+. Use in a Maven project:
```xml
@@ -73,6 +92,19 @@ import java.time.LocalDate;
record Contract(String partyA, String partyB, LocalDate effectiveDate, BigDecimal totalValue) {}
+var trustDoc = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY"))
+ .parsePdf(Path.of("contract.pdf"))
+ .withParser(ParserPreset.STANDARD)
+ .parse();
+
+System.out.println(trustDoc.toMarkdownClean());
+System.out.println(trustDoc.toJsonEvidence());
+```
+
+The legacy extraction wrapper can still bind a parsed document to an LLM schema
+while the TrustDocument-native extraction API converges:
+
+```java
var result = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY"))
.fromPdf(Path.of("contract.pdf"))
.extract("Extract the contract terms", Contract.class)
@@ -101,9 +133,11 @@ The CLI is for first-run inspection, parser debugging, schema checks, and CI
smoke tests. Parser and schema inspection do not require an LLM key.
```bash
+cargo build --manifest-path runtime/doctruth-runtime/Cargo.toml --release
mvn package -DskipTests
-java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf --bboxes
-java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf --json -o parsed.json
+export DOCTRUTH_RUNTIME_COMMAND="$PWD/runtime/doctruth-runtime/target/release/doctruth-runtime"
+java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf
+java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf --format json -o trust-document.json
java -jar target/doctruth-java-0.2.0-alpha-all.jar schema contract.schema.json
```
@@ -125,8 +159,8 @@ doctruth version
-- Parses PDF, DOCX, XLSX, and CSV into sections with source locations; PDF text sections include page-normalized bounding boxes when layout data is available.
-- Extracts Java records or JSON Schema-bound objects through LLM providers.
+- Parses documents through the Rust runtime into source-grounded evidence units; PDF text sections include page-normalized bounding boxes when layout data is available.
+- Extracts Java records or JSON Schema-bound objects through LLM providers via the Java wrapper.
- Validates structured output locally and retries repairable failures.
- Matches extracted fields back to exact source quotes.
- Returns per-field `Citation`, including source location and optional PDF bounding box, plus `Confidence` and `Provenance`.
@@ -194,7 +228,9 @@ var local = DocTruth.withProvider(LlmProviders.openAiCompatible(
```bash
doctruth init
-doctruth parse contract.pdf --bboxes
+doctruth parse contract.pdf
+doctruth parse contract.pdf --format json -o trust-document.json
+doctruth ingest-audit ./resumes --json -o ingest-audit.json
doctruth schema contract.schema.json
doctruth doctor
doctruth extract contract.pdf -s contract.schema.json
@@ -222,7 +258,7 @@ doctruth audit .doctruth/runs//audit.json
- [OSS PMF gap](docs/oss-pmf-gap.md)
- [Release process](docs/release.md)
- Use cases:
- - [Auditable LLM extraction for Java](docs/use-cases/auditable-llm-extraction-java.md)
+ - [Auditable LLM extraction with the Java wrapper](docs/use-cases/auditable-llm-extraction-java.md)
- [Source citations for LLM output](docs/use-cases/source-citations-for-llm-output.md)
- [PDF extraction with bounding boxes](docs/use-cases/pdf-extraction-with-bounding-boxes.md)
- [Contributing](CONTRIBUTING.md)
diff --git a/docs/adr/0009-auditable-structured-extraction-engine-scope.md b/docs/adr/0009-auditable-structured-extraction-engine-scope.md
index 9fa55878..d1b2615d 100644
--- a/docs/adr/0009-auditable-structured-extraction-engine-scope.md
+++ b/docs/adr/0009-auditable-structured-extraction-engine-scope.md
@@ -90,7 +90,7 @@ not become generic core behavior.
| Jurisdiction-specific interpretation | Out of core | Legal/regulatory interpretation changes over time and should be owned by domain packages or applications. |
| SIEM, key-management, and residency integrations | Out of core | Organization-specific deployment policy. |
| Dashboard / auditor portal | Out of core | Application surface beyond the library. |
-| OCR engines and form-recognition models | Out of core by default | Heavy model/runtime choices should be pluggable rather than bundled. |
+| OCR model/runtime packages | Out of the generic jar by default | DocTruth core owns the `OcrEngine` SPI and local worker protocol; desktop/deployment packages carry heavy engines and model files. |
## Consequences
diff --git a/docs/adr/0010-rust-runtime-protocol-dependencies.md b/docs/adr/0010-rust-runtime-protocol-dependencies.md
new file mode 100644
index 00000000..b2b58ed5
--- /dev/null
+++ b/docs/adr/0010-rust-runtime-protocol-dependencies.md
@@ -0,0 +1,47 @@
+# ADR 0010: Rust Runtime Protocol Dependencies
+
+Status: accepted
+
+## Context
+
+DocTruth v1 introduces a Rust sidecar runtime boundary for the parser core. The
+Java SDK talks to this sidecar through a JSON stdin/stdout protocol. The runtime
+needs deterministic JSON parsing and rendering, plus process-level contract
+tests for the binary.
+
+## Decision
+
+Use these Rust dependencies in `runtime/doctruth-runtime`:
+
+```text
+lopdf direct page content operation parsing for simple bordered-table grids
+pdf-extract runtime text-layer PDF extraction for the first non-model baseline
+serde_json runtime JSON protocol parsing and rendering
+sha2 stable per-page runtime hash metadata
+assert_cmd dev-only binary contract tests
+predicates dev-only stdout/stderr assertions
+```
+
+`lopdf` is declared with `default-features = false`. The runtime only needs
+basic PDF object/content-stream parsing here; optional chrono, jiff, rayon, and
+time features are not part of the local sidecar baseline.
+
+`sha2` is used only for deterministic local metadata. The runtime does not yet
+render page images; the current Rust-side page hash is a stable hash over page
+content bytes and media-box dimensions, not a rendered PNG hash.
+
+The MVP intentionally does not add OCR, ONNX, model-assisted table, or Markdown
+rendering dependencies. Those will need separate ADRs because they affect
+runtime size, licensing, model provenance, and local installation behavior.
+
+## Consequences
+
+- The sidecar protocol is covered by executable-level tests instead of only unit
+ tests.
+- Runtime output remains standard JSON that the Java `SidecarParserBackend`
+ can consume.
+- The first Rust parser slice can extract text-layer PDFs but does not imply
+ layout/table/OCR quality claims.
+- The first Rust table slice can recover simple bordered-grid tables from PDF
+ drawing operations, but it does not imply borderless, merged-cell, multi-page,
+ OCR-backed, or model-assisted table quality claims.
diff --git a/docs/adr/0011-model-execution-worker-boundary.md b/docs/adr/0011-model-execution-worker-boundary.md
new file mode 100644
index 00000000..3dd159ea
--- /dev/null
+++ b/docs/adr/0011-model-execution-worker-boundary.md
@@ -0,0 +1,86 @@
+# ADR 0011: Model Execution Worker Boundary
+
+Status: accepted
+
+## Context
+
+DocTruth v1 keeps parser-quality ownership in the Java/OpenDataLoader-compatible
+core while moving model-worker and Python replacement ownership into
+`doctruth-runtime`. The legacy research stack was heterogeneous:
+
+```text
+RT-DETR/TATR ONNXRuntime artifacts and tensor decoders
+SLANeXT/PaddleOCR PaddleOCR plus PaddlePaddle runtime
+RapidOCR RapidOCR plus ONNXRuntime or MNN backends
+```
+
+Bundling all of these Python runtimes directly into the production parser path
+made the local runtime larger, harder to install, harder to license-audit, and
+less portable. It also forced users to reason about OCR/table/layout
+dependencies even when they only needed text-layer evidence.
+
+## Decision
+
+For DocTruth v1, Rust runtime-shell ownership means:
+
+```text
+doctruth-runtime owns warm parser process orchestration
+doctruth-runtime owns model manifest/cache validation
+doctruth-runtime owns source hash and request envelope construction
+doctruth-runtime owns worker response validation and normalization
+doctruth-runtime owns benchmark_corpus execution
+doctruth-runtime owns audit-grade warning propagation
+heavy model execution happens through Rust-owned local workers
+those workers may run as isolated local processes
+```
+
+The production model worker is a local, explicit, auditable Rust process
+connected through JSON stdin/stdout:
+
+```text
+runtime/doctruth-runtime/src/bin/doctruth-mnn-model-worker.rs
+bin/doctruth-mnn-model-worker
+```
+
+Legacy Python workers may remain in the source tree only as migration or
+differential-oracle tools. They are not installed by the default source install,
+are not included in release tarballs, and are not a production parser strategy.
+
+The Rust runtime must treat workers as implementation details behind its
+control plane. A successful model-assisted parse must still return a normalized
+`TrustDocument` with:
+
+```text
+parserRun.backend = rust-sidecar+model-worker
+parserRun.workerBackend = original worker backend
+parserRun.runtime = doctruth-runtime
+parserRun.models = required model identities
+```
+
+## Consequences
+
+- The CLI is Rust-shell-first without bundling PaddleOCR, PaddlePaddle,
+ RapidOCR, or ONNXRuntime Python environments into the production package.
+- Release packages include the Rust runtime and Rust MNN worker, not Python
+ worker adapters.
+- Real MNN inference remains behind the Rust worker implementation and model
+ manifest/cache contract; replacing the protocol stub with actual MNN calls is
+ an implementation task, not a license to reintroduce Python production
+ residency.
+- In-process Rust model execution remains a future optimization.
+- Parser accuracy remains owned by the Java/OpenDataLoader-compatible quality
+ core until benchmark parity proves a replacement. Passing generated
+ real-route smokes proves integration, not production accuracy.
+
+## Verification
+
+The accepted worker boundary is covered by:
+
+```text
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml
+sh scripts/smoke-doctruth-runtime-model-worker.sh
+```
+
+These tests and smokes prove the Rust runtime controls the model-assisted parse
+route and normalizes worker output. They do not replace broad human-reviewed
+parser accuracy corpora.
diff --git a/docs/architecture/auditable-structured-extraction-engine.md b/docs/architecture/auditable-structured-extraction-engine.md
index 0798d134..d2a04fa1 100644
--- a/docs/architecture/auditable-structured-extraction-engine.md
+++ b/docs/architecture/auditable-structured-extraction-engine.md
@@ -313,7 +313,7 @@ work:
| Region/data-residency enforcement | Customer-specific infrastructure policy. |
| Managed key pools and vendor-key rotation | Operational integration outside the single-jar library. |
| Compliance dashboard and auditor portal | Application surface for compliance teams, not a Java primitive. |
-| OCR engines and form-recognition models | Heavy runtime/model choices should be pluggable rather than bundled. |
+| OCR model/runtime packages | Heavy runtime/model choices should be pluggable; DocTruth core owns the `OcrEngine` SPI and local worker protocol, while desktop/deployment packages carry engines and model files. |
Rule of thumb:
diff --git a/docs/cli.md b/docs/cli.md
index 01e232fb..b77e4c60 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -1,10 +1,8 @@
# CLI
-DocTruth CLI is the try/debug/inspect entry point. The primary integration path
-is the Java SDK (`DocTruth.withOpenAi(...).fromPdf(...).extract(...).run()`),
-while the CLI is optimized for first-run evidence inspection: parse without an
-LLM key, check schemas directly, and write extraction outputs into a run
-directory.
+DocTruth CLI is the try/debug/inspect entry point for the Rust-core document
+evidence runtime. The Java SDK and CLI are wrappers; parser ownership lives in
+`runtime/doctruth-runtime`.
Build the standalone CLI jar:
@@ -54,30 +52,365 @@ No provider key required:
doctruth parse contract.pdf
```
-Prints a summary:
+Prints a TrustDocument parser summary:
```text
contract.pdf
pages: 3
-sections: 42
-text: 38
+units: 42
tables: 2
-figures: 0
-bbox coverage: 31/38
+parser backend: rust-sidecar
+audit grade: AUDIT_GRADE
```
-Write parsed sections as JSON:
+By default the CLI uses `--backend auto`, which requires the local Rust runtime.
+Installed release launchers set `DOCTRUTH_RUNTIME_COMMAND` automatically. Direct
+jar usage must set `DOCTRUTH_RUNTIME_COMMAND` or pass `--runtime <path>`.
+Missing Rust runtime is an installation/configuration error, not a Java/PDFBox
+fallback. Use `--backend pdfbox` only for legacy/oracle comparison.
```bash
-doctruth parse contract.pdf --json -o parsed.json
+doctruth parse contract.pdf --format json
+doctruth parse contract.pdf --json -o trust-document.json
+doctruth parse contract.pdf --markdown -o parsed.md
+DOCTRUTH_RUNTIME_COMMAND=./doctruth-runtime doctruth parse contract.pdf --format json
+doctruth parse contract.pdf --backend pdfbox --format json
```
+`--json` and `--markdown` are Rust TrustDocument aliases, not legacy
+ParsedDocument aliases. The old Java/PDFBox `ParsedDocument` shapes are
+available only for explicit compatibility/oracle runs:
+
+```bash
+doctruth parse contract.pdf --backend pdfbox --format legacy-json -o parsed.legacy.json
+doctruth parse contract.pdf --backend pdfbox --format legacy-markdown -o parsed.legacy.md
+```
+
+Write a clean plain-text consumption view for LLM/RAG cleanup steps:
+
+```bash
+doctruth parse contract.pdf --format plain -o parsed.txt
+```
+
+Plain text keeps the parser reading order and table row/column content, but
+does not include Markdown table separators, evidence anchors, bbox metadata, or
+hashes. Use JSON or Markdown plus `--source-map` when the downstream consumer
+needs audit-grade evidence links.
+
+Write compact evidence wire output for LLM/RAG context:
+
+```bash
+doctruth parse contract.pdf --format compact -o context.doctruth-wire
+```
+
+Compact output keeps document id, source hash, unit ids, evidence span ids,
+table ids, warnings, and optional `bbox=` metadata for citeable units while
+remaining materially smaller than full JSON. When `--out` is used, compact
+output is written through the streaming writer path rather than first rendering
+the full wire document into one aggregate string.
+
+Compact output can also emit a source-map sidecar:
+
+```bash
+doctruth parse contract.pdf --format compact --source-map -o context.doctruth-wire
+```
+
+The compact source map records rendered offsets for compact unit text fields,
+so LLM/RAG context can be tied back to unit ids and evidence span ids.
+
+Verify that a rendered Markdown file still matches its source-map sidecar:
+
+```bash
+doctruth verify-source-map parsed.md parsed.doctruth-map.json --source contract.pdf
+```
+
+This recomputes the rendered content hash and, when `--source` is supplied, the
+source document hash. It fails if the Markdown or source document has been
+changed after the source map was generated.
+
+Write a hashable audit package for compliance/replay systems:
+
+```bash
+doctruth parse contract.pdf --format audit -o audit.json
+```
+
+Verify the audit package against the canonical full TrustDocument JSON:
+
+```bash
+doctruth parse contract.pdf --format json --profile full -o trust-document.json
+doctruth verify-audit trust-document.json audit.json
+```
+
+Audit JSON includes the source hash, canonical `TrustDocument` hash, evidence
+hash, parser run metadata, audit-grade status, and evidence units. It is
+hashable and replay-friendly. `verify-audit` fails if the audit package no
+longer matches the canonical document, source hash, canonical hash, evidence
+hash, parser run metadata, or evidence payload. It is not yet an externally
+signed or timestamped audit package.
+
+Write a page-aware HTML review surface for bbox overlays:
+
+```bash
+doctruth parse contract.pdf --format html -o review.html
+```
+
+HTML review output includes page containers with page number, dimensions,
+text-layer availability, page image hash, nested unit/table/cell anchors, and
+page-scoped bbox overlay nodes for units, tables, and cells. It is intended for
+local evidence review and overlay tooling, not as a full
+hosted auditor UI.
+
+Write a local review package for visual parser QA:
+
+```bash
+doctruth review-package contract.pdf -o .doctruth/reviews/contract
+```
+
+The package includes `review.html`, `trust-document.json`, page PNG artifacts,
+and `page-images.json`. Phase 250 also writes layered trace artifacts:
+`content_blocks.json`, `parse_trace.json`, `layout-debug.html`, and
+`span-debug.html`. `content_blocks.json` is the flat reading-order block stream.
+`parse_trace.json` is the page/block/line/span evidence layer. The two debug
+HTML files carry `data-trace-block-id`, `data-trace-line-id`, and
+`data-trace-span-id` attributes whose ids match the corresponding entries in
+`parse_trace.json`, so reviewers can inspect layout and span overlays against
+the same trace ids used by the machine-readable trace.
+
+This closes the review-package visual trace artifact contract. It does not
+claim that Rust-native real model/OCR execution or the broad human-reviewed
+parser accuracy corpus are complete; those remain pending.
+
Show that bbox recovery is enabled in the summary:
```bash
doctruth parse contract.pdf --bboxes
```
+### Ingest Audit
+
+Run a no-LLM PDF corpus audit before extraction:
+
+```bash
+doctruth ingest-audit ./resumes --json -o ingest-audit.json
+```
+
+This walks local PDFs and reports parser-layer gaps only: pages that should be
+routed to OCR before DocTruth block assembly, oversized blocks, missing headings,
+missing text bboxes, and parse failures. It does not call providers or OCR
+engines and does not include recovered document text in the JSON.
+
+### Benchmark Corpus
+
+Run a labeled parser benchmark corpus with metric thresholds:
+
+```bash
+doctruth benchmark-corpus parser-corpus.json
+doctruth benchmark-corpus parser-corpus.json --json
+doctruth benchmark-corpus parser-corpus.json --json --report-out parser-report.json
+doctruth benchmark-corpus parser-corpus.json --offline
+doctruth verify-benchmark-report parser-report.json
+```
+
+The corpus manifest resolves paths relative to itself and requires each case to
+provide:
+
+```text
+source
+or sourceUrl + sourceSha256
+expectedMarkdown
+expectedDocument
+```
+
+Use `--report-out <path>` for recorded parser-quality runs. The report is
+the machine-readable benchmark result plus `reportFormat` and the resolved
+manifest path with `manifestSha256`. It also copies the `minimums` and
+`maximums` thresholds used for the run and records actual `caseCount` plus
+`casesPerTag` coverage from the cases that ran. Per-case entries include label
+id, coverage tags, metrics, and `sourceSha256` when the manifest pins the source
+PDF, so parser-accuracy evidence can be archived instead of relying on terminal
+output.
+
+Use `verify-benchmark-report <report>` to verify a recorded report without
+rerunning the parser. The verifier checks the report format, pass status,
+manifest path, `manifestSha256`, copied threshold objects, actual
+`caseCount`/`casesPerTag`, copied coverage thresholds such as
+`minCasesPerTag`/`minTotalCases`, metric values against `minimums`/`maximums`,
+aggregate metrics recomputed from case-level metric evidence, and source-hash
+pins echoed from the manifest.
+
+Use top-level `minimums` for higher-is-better metrics such as
+`reading_order_f1`, `quote_anchor_accuracy`, `bbox_iou`, and `table_cell_f1`.
+Use top-level `maximums` for lower-is-better metrics such as
+`strict_warning_false_negative_rate` and aggregate runtime gates such as
+`parser_latency_p95`.
+
+`--json` emits corpus-level aggregate metrics under top-level `metrics`,
+including `parser_latency_p50` and `parser_latency_p95`, and per-case metrics
+under each case.
+
+Use `--offline` to require cache-only execution for remote `sourceUrl` cases.
+Uncached remote fixtures fail before any network request; previously cached
+fixtures are still verified by `sourceSha256` before parsing.
+
+`source` is a manifest-relative local path. `sourceUrl` downloads a remote
+fixture into `.doctruth-corpus-cache` next to the manifest and requires
+`sourceSha256` in `sha256:<hex>` form before parsing. `expectedDocument` is the
+lossless `TrustDocument` JSON label. The command reuses the SDK benchmark
+metrics and exits non-zero when any configured minimum threshold fails.
+
+### Local OCR
+
+`doctruth parse` uses the Rust runtime path by default for both normal
+text-layer PDFs and OCR/model-assisted presets. For OCR work, the Rust runtime
+routes through the local model worker protocol before DocTruth block assembly.
+The production worker protocol is JSON over stdin/stdout and is owned by the
+Rust runtime. Source installs and release tarballs include
+`doctruth-mnn-model-worker`; they do not package Python RapidOCR, SLANeXT, or
+ONNX workers as production entrypoints.
+
+For v1 `TrustDocument` outputs, use the OCR preset explicitly:
+
+```bash
+doctruth parse scanned.pdf --format json --preset ocr -o scanned.trust.json
+doctruth review-package scanned.pdf --preset ocr -o .doctruth/reviews/scanned
+```
+
+Those commands emit `parserRun.backend=rust-sidecar+model-worker` when routed
+through the Rust runtime, include the selected MNN model identity in parser
+models, and mark recovered text units as `OCR_REGION`. OCR page confidence is
+copied into the unit evidence. If the worker returns confidence below `0.85`,
+the unit receives a severe `ocr_low_confidence` warning and the document is
+`NOT_AUDIT_GRADE`; the text is still present for review and replay.
+
+Worker command discovery order, followed by related model and timeout settings:
+
+```bash
+DOCTRUTH_RUNTIME_MODEL_COMMAND=/path/to/doctruth-mnn-model-worker
+DOCTRUTH_MODEL_COMMAND=/path/to/doctruth-mnn-model-worker
+doctruth-mnn-model-worker on PATH
+DOCTRUTH_MODEL_CACHE=/path/to/model-cache
+DOCTRUTH_MODEL_MANIFEST=/path/to/models.json
+DOCTRUTH_OCR_TIMEOUT_MS=30000
+```
+
+The same values can be supplied as JVM properties, for example
+`-Ddoctruth.model.command=/path/to/doctruth-mnn-model-worker`.
+
+The worker `--doctor` command verifies the Rust MNN protocol entrypoint:
+
+```bash
+doctruth-mnn-model-worker --doctor
+```
+
+The doctor reports `runtime=mnn`, `engine=mnn`, protocol version,
+`protocolReady=true`, `inferenceReady=false`, and
+`productionPythonResidency=false` until real MNN inference is wired. It also
+reports `nativeBackend.compiled`; this is `false` in the default build and
+`true` only when built with the optional `mnn-native` Cargo feature. Model files
+are packaged with the client runtime or supplied through `DOCTRUTH_MODEL_CACHE`
+and `DOCTRUTH_MODEL_MANIFEST`; they are not bundled in the generic Java jar.
+`DOCTRUTH_MNN_WORKER_STUB=1` is reserved for local contract smokes. Stub output
+is explicitly marked `NOT_AUDIT_GRADE` and must not be treated as production
+inference.
+
+The optional native MNN probe verifies real Rust-side MNN session creation and
+inference with a supplied executable `.mnn` model:
+
+```bash
+DOCTRUTH_MNN_NATIVE_PROBE_MODEL=/path/to/model.mnn \
+ scripts/smoke-doctruth-mnn-native-probe.sh
+```
+
+`--doctor` only proves the worker protocol. `--probe-model` proves native MNN
+loading/session/inference. MNN benchmark or shape-only artifacts that have
+weights stripped are not valid inference acceptance models.
+
+DocTruth ships model manifests, not binary model files. To fetch the default
+PP-OCRv5 mobile MNN OCR model pack into the local cache:
+
+```bash
+scripts/fetch-doctruth-model-pack.py \
+ --manifest model-packs/ppocr-v5-mobile-mnn.json \
+ --cache .doctruth/models
+
+DOCTRUTH_MODEL_MANIFEST=model-packs/ppocr-v5-mobile-mnn.json \
+DOCTRUTH_MODEL_CACHE=.doctruth/models \
+ doctruth-runtime --doctor
+```
+
+To fetch the public OpenDataLoader-style hybrid model references used for
+layout/table parity work:
+
+```bash
+scripts/fetch-doctruth-model-pack.py \
+ --manifest model-packs/opendataloader-hybrid-models.json \
+ --cache .doctruth/models
+```
+
+`opendataloader-hybrid-models.json` pins the public RT-DETR layout and
+TATR-compatible table artifacts used by DocTruth's migration harness. The
+historical OpenDataLoader `table_transformer` branch calls an external TATR
+HTTP service; that service repository is not currently publicly fetchable, so
+DocTruth does not claim it has vendored that private service. It normalizes
+public model outputs through `TrustDocument`.
+
+Every model artifact must carry a preprocessing and parity contract. Before
+promoting a converted MNN/C++ decoder, dump the Python/ONNX reference input
+tensor and the Rust/MNN candidate input tensor for the same image and compare
+the first values plus the raw float32 tensor hash:
+
+```bash
+scripts/doctruth-preprocess-tensor-probe.py \
+ --manifest model-packs/opendataloader-hybrid-models.json \
+ --preset table-lite \
+ --model xenova-table-transformer-structure-recognition \
+ --image page.png \
+ --first 32
+```
+
+The candidate MNN worker must emit the same shape, channel order, resize,
+mean/std normalization, first tensor values, and tensor hash within the
+manifest's `parity.maxAbsDiff`. Most conversion regressions are preprocessing
+drift, not inference engine drift; RGB/BGR order, resize policy, scale, mean,
+and std are part of the acceptance contract.
+
+For a Rust MNN worker, package these model files with the client runtime:
+
+```bash
+DOCTRUTH_OCR_DET_MODEL=/path/to/ocr/det_model.mnn
+DOCTRUTH_OCR_REC_MODEL=/path/to/ocr/rec_model.mnn
+DOCTRUTH_OCR_KEYS_PATH=/path/to/ocr/ppocr_keys.txt
+```
+
+### Legacy Python Model Workers
+
+Legacy Python model workers in `scripts/` are oracle-only migration tools. They
+fail closed unless `DOCTRUTH_ALLOW_PYTHON_ORACLE=1` is set by an explicit test
+or comparison harness. Do not configure them as production local workers.
+
+The repository still keeps RapidOCR, SLANeXT/PaddleOCR, and ONNXRuntime Python
+worker scripts as legacy migration or differential-oracle tools. They are not
+installed by `scripts/install-cli.sh`, are not included in release tarballs, and
+are not the production parser path. Use them only when explicitly comparing old
+behavior or validating a migration fixture.
+
+To validate a user-supplied legacy model artifact, write a model manifest with
+`source`, `sha256`, `task`, and the legacy runtime fields, then run the opt-in
+source-tree smoke:
+
+```bash
+DOCTRUTH_REAL_MODEL_MANIFEST=models.json \
+DOCTRUTH_REAL_MODEL_PRESET=table-lite \
+DOCTRUTH_REAL_MODEL_EXPECTED_ID=tatr:v1 \
+DOCTRUTH_REAL_MODEL_EXPECTED_TASK=table-structure-recognition \
+scripts/smoke-doctruth-real-model-artifact.sh
+```
+
+The smoke skips when `DOCTRUTH_REAL_MODEL_MANIFEST` is not set. These legacy
+smokes do not change the production contract: parser quality and release
+packaging must flow through Rust-owned runtime behavior normalized into
+`TrustDocument`.
+
### Schema
Check a JSON Schema:
@@ -152,6 +485,96 @@ doctruth doctor --json
```
`doctor` does not call an LLM. It is safe to run before configuring extraction.
+It also reports local OCR worker readiness: resolved worker command, `mnn`
+engine setting, fallback engine, timeout, and whether OCR is disabled. This is
+an executable/protocol readiness check; a raw `rapidocr` command is not assumed
+to be a compatible worker unless it is wrapped behind DocTruth's JSON
+stdin/stdout worker protocol.
+
+### Cache Warm
+
+Warm a local parser model cache from a manifest before using a model-assisted
+preset:
+
+```bash
+doctruth cache warm models.json --preset table-lite --cache .doctruth/models --json
+```
+
+The manifest is keyed by parser preset id and can reference local files,
+`file://` URLs, or HTTP(S) URLs:
+
+```json
+{
+ "presets": {
+ "table-lite": [
+ {
+ "name": "slanet-plus",
+ "version": "local",
+ "source": "models/slanet.onnx",
+ "sha256": "sha256:...",
+ "sizeBytes": 123456,
+ "required": true,
+ "task": "table-structure",
+ "backend": "onnxruntime",
+ "format": "onnx",
+ "precision": "int8",
+ "license": "apache-2.0"
+ }
+ ]
+ }
+}
+```
+
+`cache warm` copies local sources or downloads HTTP(S) sources into the
+standard DocTruth cache filename for that model, then verifies SHA-256 through
+the same model-cache verifier used by MCP and model-worker requests.
+`--offline` refuses remote sources before any network request. Runtime hint
+fields are preserved in `cache warm --json`, `doctor --json`, and the local
+model-worker request; they describe how a real worker should load the artifact,
+but do not make DocTruth execute ONNX by themselves.
+
+### MCP
+
+Run a local stdio MCP server for agent-side document evidence access:
+
+```bash
+doctruth mcp
+```
+
+The bundled skill package can write a local MCP config snippet:
+
+```bash
+skills/doctruth/scripts/bootstrap-local-mcp.sh --command doctruth --print-json
+```
+
+Supported tools:
+
+```text
+doctruth.parse_document
+doctruth.get_layout_regions
+doctruth.get_table_cells
+doctruth.get_evidence_span
+doctruth.verify_citation
+doctruth.warm_model_cache
+```
+
+`doctruth.parse_document` accepts a local `path`, optional `preset`, optional
+`format` (`compact_llm`, `json_evidence`, or `json_full`), and optional
+`sourceMap`. The tool returns MCP `structuredContent` with compact LLM text,
+JSON evidence units, bbox-bearing unit locations, and a source map when
+requested. This is a local stdio gateway over the same parser contracts used by
+the CLI and SDK.
+
+The evidence tools all accept a local `path` and optional `preset`.
+`doctruth.get_layout_regions` returns citeable units with page, reading order,
+evidence span ids, text, and bbox anchors. `doctruth.get_table_cells` returns
+structured tables and cell-level bboxes. `doctruth.get_evidence_span` returns
+the unit backing a requested `evidenceSpanId`. `doctruth.verify_citation`
+checks a caller-supplied `quote` against an `evidenceSpanId` and returns a
+boolean verification plus match score. `doctruth.warm_model_cache` verifies a
+caller-supplied local model cache directory and expected model descriptors
+before model-assisted parsing; it reports READY/MISSING/SHA_MISMATCH without
+downloading models.
### Completion
diff --git a/docs/homebrew.md b/docs/homebrew.md
index c059e3bc..57cbeffb 100644
--- a/docs/homebrew.md
+++ b/docs/homebrew.md
@@ -52,6 +52,7 @@ Smoke the generated tarball:
mkdir -p /tmp/doctruth-release-smoke
tar -xzf dist/doctruth-0.2.0-alpha.tar.gz -C /tmp/doctruth-release-smoke
JAVA=/path/to/java /tmp/doctruth-release-smoke/doctruth-0.2.0-alpha/bin/doctruth version
+JAVA=/path/to/java /tmp/doctruth-release-smoke/doctruth-0.2.0-alpha/bin/doctruth-runtime --doctor
```
## Why The Formula Is Not Committed As A Live Formula Here
diff --git a/docs/install.md b/docs/install.md
index a2def5f3..95ef2f99 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -1,8 +1,7 @@
# Install DocTruth CLI
-The Java SDK is the primary production integration path. The CLI is the
-try/debug/inspect path: it lets a Java team verify the core promise before
-writing integration code:
+DocTruth's parser core is the Rust runtime. The Java SDK and CLI are wrappers
+for application integration, packaging, and first-run inspection:
```text
document -> parsed sections with source locations -> schema check -> audit output
@@ -10,7 +9,7 @@ document -> parsed sections with source locations -> schema check -> audit outpu
## SDK Install
-Use the SDK when adding DocTruth to an application:
+Use the Java wrapper SDK when adding DocTruth to an application:
```xml
@@ -20,14 +19,19 @@ Use the SDK when adding DocTruth to an application:
```
-Minimal application flow:
+Set the Rust runtime command for direct Maven/JAR usage:
+
+```bash
+export DOCTRUTH_RUNTIME_COMMAND=/path/to/doctruth-runtime
+```
+
+Minimal TrustDocument parser flow:
```java
-var result = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY"))
- .fromPdf(Path.of("contract.pdf"))
- .extract("Extract contract terms", Contract.class)
- .withEvidence()
- .run();
+var trustDoc = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY"))
+ .parsePdf(Path.of("contract.pdf"))
+ .withParser(ParserPreset.STANDARD)
+ .parse();
```
## CLI From Source
@@ -46,9 +50,11 @@ Run it directly:
java -jar target/doctruth-java-0.2.0-alpha-all.jar --help
```
-Install a `doctruth` launcher:
+Install a `doctruth` launcher, the Rust parser runtime, and the Rust MNN model
+worker:
```bash
+cargo build --manifest-path runtime/doctruth-runtime/Cargo.toml --release --bins
scripts/install-cli.sh --prefix "$HOME/.local"
```
@@ -63,9 +69,18 @@ Check the install:
```bash
doctruth version
doctruth doctor
-doctruth parse fixtures/pdf/ResumeAFIQDANISH.pdf --bboxes
+doctruth-runtime --doctor
+doctruth-mnn-model-worker --doctor
+doctruth parse fixtures/pdf/ResumeAFIQDANISH.pdf --format json
```
+The installed `doctruth` launcher discovers `bin/doctruth-runtime` and exports
+`DOCTRUTH_RUNTIME_COMMAND` automatically. It also discovers
+`bin/doctruth-mnn-model-worker` and exports `DOCTRUTH_RUNTIME_MODEL_COMMAND`
+and `DOCTRUTH_MODEL_COMMAND` automatically. TrustDocument parse formats use the
+Rust runtime by default after install. Use `--backend pdfbox` only for
+legacy/oracle comparison during migration or regression debugging.
+
If `java` is not on `PATH`, point the launcher at your Java 25 runtime:
```bash
@@ -86,8 +101,9 @@ java -version
No provider key is required for parser and schema inspection:
```bash
-doctruth parse contract.pdf --bboxes
-doctruth parse contract.pdf --json -o parsed.json
+doctruth parse contract.pdf
+doctruth parse contract.pdf --format json -o trust-document.json
+doctruth ingest-audit ./resumes --json -o ingest-audit.json
doctruth schema contract.schema.json
```
@@ -120,13 +136,36 @@ checksums.txt
doctruth.rb
```
-Use the tarball when you want a `bin/doctruth` launcher plus the bundled jar:
+Use the tarball when you want a `bin/doctruth` launcher, `bin/doctruth-runtime`,
+`bin/doctruth-mnn-model-worker`, and the bundled jar:
```bash
tar -xzf doctruth-0.2.0-alpha.tar.gz
JAVA=/path/to/java ./doctruth-0.2.0-alpha/bin/doctruth version
```
+Release tarballs do not include RapidOCR, SLANeXT/PaddleOCR, or ONNXRuntime
+Python worker scripts. Those scripts remain in the source tree only as
+legacy/oracle tools for migration comparisons. Production release packaging is
+Rust runtime plus Rust MNN model worker. OCR/model files are not bundled inside
+the Java jar; provide them through the local runtime package or
+`DOCTRUTH_MODEL_CACHE` plus `DOCTRUTH_MODEL_MANIFEST`.
+
+The release launcher also discovers its same-directory `doctruth-runtime` and
+`doctruth-mnn-model-worker`, then sets `DOCTRUTH_RUNTIME_COMMAND`,
+`DOCTRUTH_RUNTIME_MODEL_COMMAND`, and `DOCTRUTH_MODEL_COMMAND` automatically,
+so packaged CLI parsing is Rust-first without extra environment setup.
+
+Real layout/table model artifacts are not bundled. Use a manifest and the
+opt-in real model smoke to validate a local artifact before relying on it:
+
+```bash
+DOCTRUTH_REAL_MODEL_MANIFEST=models.json \
+DOCTRUTH_REAL_MODEL_PRESET=standard \
+DOCTRUTH_REAL_MODEL_EXPECTED_TASK=layout-detection \
+scripts/smoke-doctruth-real-model-artifact.sh
+```
+
Use the all-jar when you want the simplest direct invocation:
```bash
diff --git a/docs/parser-capability-matrix.md b/docs/parser-capability-matrix.md
index 08b6e2c0..b5401c3b 100644
--- a/docs/parser-capability-matrix.md
+++ b/docs/parser-capability-matrix.md
@@ -3,18 +3,66 @@
DocTruth parsing exists to preserve evidence anchors for extraction. It is not a
general document conversion product.
+## Runtime Status
+
+`doctruth-runtime` is now an active Rust-controlled runtime, not only a future
+placeholder. It owns `parse_pdf`, `benchmark_corpus`,
+`verify_benchmark_report`, `--doctor`, model-worker request handoff, layered
+`TrustDocument` outputs, and real-route smokes for runtime, corpus, OCR, table,
+and model-worker paths.
+
+Rust is the default parser core. Packaged CLI installs wire
+`DOCTRUTH_RUNTIME_COMMAND` automatically, and direct SDK/JAR usage must configure
+the runtime explicitly. Java/PDFBox is legacy/oracle-only and must be selected
+explicitly for migration or differential testing. Heavy layout, table, and OCR
+model execution remains local-worker and opt-in; those smokes prove integration
+through the real route, not broad production parser accuracy.
+
+## OpenDataLoader Parity Gate
+
+OpenDataLoader parity is measured, not asserted. A behavior is considered
+ported only when it has a Rust contract test, an upstream source reference, and
+either a focused OpenDataLoader Bench case or a full200 report showing the
+effect. Until full200 reaches the accepted baseline, DocTruth should be
+described as OpenDataLoader-inspired with parity porting in progress, not
+OpenDataLoader-equivalent.
+
+OpenDataLoader Bench is the external parser-quality foundation for reading
+order, heading hierarchy, table fidelity, and parser speed. DocTruth's
+`TrustDocument`, source refs, quote hashes, parser warnings, and replay gates
+remain canonical; OpenDataLoader artifacts are comparison inputs, not canonical
+DocTruth output.
+
| Source | Text Anchor | Visual Anchor | Current Notes |
| --- | --- | --- | --- |
| PDF text | page, line, char offset | optional page-normalized bbox | Best-supported path for reviewer highlights |
-| PDF scanned image | future OCR adapter | future OCR bbox | Not a built-in OCR engine today |
+| PDF scanned image | OCR adapter via `OcrEngine` SPI | OCR bbox when regions are supplied | Low-text pages route before DocTruth block assembly; CLI auto-discovers local OCR workers when packaged |
| DOCX | paragraph-style logical sections | none | Word pagination is not stable without a renderer |
| XLSX | sheet/row-style logical sections | none | Cell-level bbox is future work |
| CSV | row/column-style logical sections | none | Logical tabular evidence only |
-| PDF tables | section-level source location | future table/cell bbox | Table geometry is not yet a public contract |
+| PDF tables | table/cell source object ids | table/cell page-normalized bbox when detected | Generated bordered-grid, conservative borderless aligned text, horizontal colspan, and vertical rowspan fixtures are covered; model-assisted and real-world labeled table accuracy remain future work |
+
+## Output Profiles
+
+| Profile | Consumer | Evidence contract |
+| --- | --- | --- |
+| `json_full` | SDKs, audit storage, replay packages | Full trust document with evidence spans, source hashes, warnings, parser run, and audit grade |
+| `json_evidence` | audit pipelines that only need evidence-bearing content | Evidence-bearing subset |
+| `markdown_clean` | LLM/RAG document consumption | Readable Markdown without inline evidence syntax; pair with a source map when audit lookup is needed |
+| `plain_text` | cleanup, keyword search, and simple LLM context | Clean text and tab-separated table rows only; not an audit artifact by itself |
+| `compact_llm` | token-efficient LLM/RAG context | Compact deterministic wire format with evidence ids and warnings |
+| `html_review` | local evidence review UI | Review anchors suitable for bbox overlays and table/cell inspection |
Rules:
- `SourceLocation` is the durable audit anchor.
- `BoundingBox` is an optional visual anchor for PDF-originated text.
- Absence of bbox does not mean absence of evidence.
-- Scanned PDFs should be routed to OCR before relying on DocTruth extraction.
+- Scanned PDFs should be routed through the Rust model-worker path before
+ DocTruth block assembly.
+- The CLI discovers the production local model worker via
+ `DOCTRUTH_RUNTIME_MODEL_COMMAND`, `DOCTRUTH_MODEL_COMMAND`, or
+ `doctruth-mnn-model-worker` on `PATH`. Legacy Python OCR/table/model worker
+ names remain source-tree oracle tools only. OCR/table models stay in the
+ desktop/deployment package or local model cache, not in the generic Java
+ parser jar.
diff --git a/docs/parser/opendataloader-bench-runbook.md b/docs/parser/opendataloader-bench-runbook.md
new file mode 100644
index 00000000..cb6e2f63
--- /dev/null
+++ b/docs/parser/opendataloader-bench-runbook.md
@@ -0,0 +1,102 @@
+# OpenDataLoader Java Core Bench Runbook
+
+`scripts/run-opendataloader-java-core-parity.sh` is the local gate for the
+Java/OpenDataLoader-compatible parser core running behind the Rust benchmark
+shell. It reuses `scripts/run-doctruth-opendataloader-bench.sh`, which sends one
+`opendataloader_prediction` request to `doctruth-runtime`.
+
+## Smoke Gate
+
+Run:
+
+```bash
+bash scripts/run-opendataloader-java-core-parity.sh --smoke
+```
+
+The script builds the Java CLI jar once, builds the Rust runtime once, then runs
+one smoke prediction over a temporary OpenDataLoader Bench view containing the
+selected PDFs and ground-truth Markdown. This keeps one warm
+`opendataloader-java-core` backend process for the smoke prediction instead of
+looping over PDFs.
+
+Smoke artifacts are written under:
+
+```text
+third_party/opendataloader-bench/prediction/doctruth-java-core-<run-id>/smoke/
+```
+
+The selected smoke set is recorded in `smoke-docs.tsv` beside the smoke output:
+
+| Fixture | Coverage |
+| --- | --- |
+| `01030000000001` | simple single column |
+| `01030000000145` | two-column |
+| `01030000000160` | sidebar/sidebar-like layout |
+| `01030000000083` | bordered table |
+| `01030000000127` | borderless table |
+| `01030000000165` | scanned/OCR fixture, only when local MNN OCR artifacts exist |
+
+The wrapper includes the scanned/OCR fixture when local MNN OCR artifacts exist,
+and the same smoke prediction still keeps one warm Java backend process. The
+current Java-core backend treats the preset as parser metadata and does not
+route scanned or sparse visual pages to the OCR model yet; OCR routing remains a
+focused model-runtime gate until Java-core OCR worker integration lands. If the
+fixture fails in this smoke, the smoke fails closed. That is intentional
+capability exposure. If the local MNN OCR manifest/cache are absent, the OCR
+fixture is skipped and `smoke-ocr-skip.txt` records the reason. The smoke gate
+still fails closed for any parsed/failed mismatch or invalid evaluation metrics.
+
+## Full200 Gate
+
+Run:
+
+```bash
+bash scripts/run-opendataloader-java-core-parity.sh --full200
+```
+
+`--full200` always runs smoke first. If smoke fails, the shell exits before the
+full200 run starts. If smoke passes, full200 artifacts are written under:
+
+```text
+third_party/opendataloader-bench/prediction/doctruth-java-core-<run-id>/full200/
+```
+
+Do not run full200 as routine implementation verification. Use it for release
+gates or explicit benchmark acceptance work.
+
+## Report Fields To Check
+
+The benchmark output is split across the runner artifacts:
+
+| Field | Artifact |
+| --- | --- |
+| overall, NID, TEDS, MHS | `evaluation.json` at `metrics.score.overall_mean`, `nid_mean`, `teds_mean`, `mhs_mean` |
+| parsed and failed counts | `summary.json` at `parsed_count`, `failed_count`, `document_count` |
+| elapsed and mean ms/doc | `summary.json` at `total_elapsed`, `elapsed_per_doc` |
+| Java backend startup | `summary.json` at `javaBackendStartupMs` |
+| Java startup/RSS, Rust RSS, model worker RSS | `resources.json` when the runtime resource reporter emits it |
+| low-score buckets | `low-score-buckets.json` generated next to `evaluation.json` |
+| worst deltas | `reference-comparison.json` at `top_losses` |
+| bucket counts | `reference-comparison.json` at `summary.failure_buckets` |
+
+Full200 acceptance should inspect all fields together. Quality metrics without
+resource data are not enough for production-profile acceptance; resource data
+without OpenDataLoader metrics is not parser parity evidence.
+
+## Current Limitation
+
+The runtime `opendataloader_prediction` command currently accepts `doc_id`,
+`limit`, or an unbounded full-corpus request. It does not accept an arbitrary
+doc-id list or per-document presets. The smoke gate therefore creates a
+temporary bench directory with only the chosen smoke PDFs and ground-truth
+Markdown, then invokes the existing runner once over that selected corpus. This
+preserves the warm Java backend behavior for the actual smoke prediction while
+avoiding a per-document runner loop.
+
+Because one prediction invocation has one preset, the wrapper keeps the existing
+`lite` default for the selected smoke corpus. When local OCR artifacts are
+installed, the scanned/OCR fixture is included in that same prediction run based
+only on artifact availability. The current Java-core backend records the preset
+but still parses with the Java parser path and `OcrEngine.NOOP`, so this smoke
+does not claim OCR model routing. Explicit preset overrides still use one preset
+for the whole smoke corpus.
diff --git a/docs/parser/opendataloader-benchmark-gates.md b/docs/parser/opendataloader-benchmark-gates.md
new file mode 100644
index 00000000..80a4f9f3
--- /dev/null
+++ b/docs/parser/opendataloader-benchmark-gates.md
@@ -0,0 +1,93 @@
+# OpenDataLoader Benchmark Gates
+
+DocTruth can write OpenDataLoader Bench-compatible prediction artifacts through
+the Rust runtime `opendataloader_prediction` command. The command is intentionally
+bounded by default.
+
+## Full200 Guard
+
+`opendataloader_prediction` must not run every PDF in the OpenDataLoader Bench
+corpus unless the request explicitly allows it.
+
+When a request has neither `doc_id` nor `limit`, the runtime rejects the request
+unless `allow_full200` is set to `true`.
+
+`scripts/run-doctruth-opendataloader-bench.sh` is the intentional benchmark
+runner. Its default mode has neither `--doc-id` nor `--limit`, so the script
+injects `allow_full200: true` for that default full200 request. Bounded script
+runs keep omitting `allow_full200`. A direct request like this one is rejected:
+
+```json
+{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "third_party/opendataloader-bench/prediction/doctruth-rust",
+ "engine": "doctruth-rust",
+ "preset": "edge-fast",
+ "profile": "edge-fast"
+}
+```
+
+The rejection is structured:
+
+```json
+{
+ "error_code": "FULL200_REQUIRES_EXPLICIT_ALLOW",
+ "message": "Set allow_full200=true to run the full OpenDataLoader Bench corpus"
+}
+```
+
+## Bounded Runs
+
+Single-document requests remain allowed without `allow_full200`:
+
+```json
+{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "target/opendataloader-prediction-one",
+ "engine": "doctruth-rust-one",
+ "doc_id": "01030000000198",
+ "preset": "edge-fast",
+ "profile": "edge-fast"
+}
+```
+
+Small multi-document requests also remain allowed without `allow_full200` when
+they set `limit`:
+
+```json
+{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "target/opendataloader-prediction-smoke",
+ "engine": "doctruth-rust-smoke",
+ "limit": 5,
+ "preset": "edge-fast",
+ "profile": "edge-fast"
+}
+```
+
+## Explicit Full200 Run
+
+Task 10 and release-gate style benchmark reports should opt in explicitly:
+
+```json
+{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23",
+ "engine": "doctruth-rust-opendataloader-full200-2026-06-23",
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "timeout_seconds": 30,
+ "allow_full200": true
+}
+```
+
+## Rationale
+
+The full OpenDataLoader Bench corpus is a quality gate, not a default unit-test
+or smoke-test path. Making full200 explicit prevents accidental long local runs,
+keeps focused parity tests fast, and leaves a clear audit signal when a benchmark
+report intentionally covers the whole corpus.
diff --git a/docs/parser/opendataloader-parity-matrix.md b/docs/parser/opendataloader-parity-matrix.md
new file mode 100644
index 00000000..5daa1bb6
--- /dev/null
+++ b/docs/parser/opendataloader-parity-matrix.md
@@ -0,0 +1,354 @@
+# OpenDataLoader Parity Matrix
+
+This matrix tracks DocTruth runtime parity against the Apache-2.0
+OpenDataLoader PDF processor surface. Status values are conservative and do not
+claim parser behavior that has not been ported or verified in DocTruth.
+
+Current execution boundary: Java/OpenDataLoader-compatible parser core is the
+current quality source of truth. Rust owns the runtime shell and Python
+replacement boundary. Python/OpenDataLoader original runners are oracle-only.
+
+## Source Snapshot
+
+- Upstream repository:
+ https://github.com/opendataloader-project/opendataloader-pdf
+- License: Apache-2.0
+- Reference commit: d1845179a1286bbb76f9618e8b6c8f51509a52f4
+- Local path: `third_party/opendataloader-pdf-reference`
+- Usage: local behavior reference, benchmark input, and oracle source for
+ Java parser-core ports first, with Rust ports only after benchmark evidence
+ supports replacement. The reference files are not compiled into DocTruth and
+ are not a production parser fallback.
+
+## Status Values
+
+- `ported`: behavior is implemented and covered in DocTruth Java parser-core
+ tests, plus Rust shell tests when benchmark/runtime packaging is affected.
+- `partial`: related behavior exists, but parity is incomplete or still under
+ verification.
+- `not_ported`: no DocTruth-owned runtime equivalent has been added yet.
+- `oracle_only`: used as an external comparison or schema reference, not as a
+ DocTruth runtime implementation.
+- `intentionally_skipped`: out of scope for DocTruth runtime by design.
+
+## Latest Full200 Snapshot
+
+- Report:
+ `docs/parser/opendataloader-processor-gap-report.md`
+- Artifacts:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-20260628T222800Z/full200/`
+- DocTruth revision used for run: local `feat/opendataloader-parity-coverage`
+ worktree with HeadingProcessor numbered continuation, colon-heading, and procedure-step repair
+- Runtime profile: `edge-model`
+- Corpus: 200 OpenDataLoader Bench PDFs
+- Prediction: 200 parsed, 0 failed
+- Overall mean: `0.833933`
+- NID mean: `0.910917`
+- TEDS mean: `0.781018`
+- MHS mean: `0.643669`
+- Resource: mean `83.872992` ms/doc, no Python/Torch/Docling
+ production residency; no OCR model route was recorded, and sparse OCR case
+ `01030000000141` remains a HybridDocumentProcessor/OCR gap
+- Interpretation: current Java/OpenDataLoader-compatible quality core clears
+ the initial local acceptance baseline, but it is still not OpenDataLoader
+ hybrid parity. This run improves heading hierarchy by promoting bare numbered
+ chapter headings, dotted numbered section headings, heading continuation
+ lines, colon headings, procedure-step demotion, activity headings, multi-line
+ cover title merges, roman-numeral heading fragment merges, and false-heading
+ demotion for running headers, figure labels, page numbers, and chart legend
+ labels in the Java parser core. It also splits selected single-word headings
+ and embedded section labels from body paragraphs.
+ It now demotes roman-style TOC chapter entries and selected institution
+ headers when stronger same-page headings exist. The next gaps are
+ OCR/model-backed tables, multi-segment rowspans, remaining heading hierarchy
+ misses, and broader paragraph/list parity.
+
+## Next Processor Work
+
+The latest full200 low-score buckets are assigned to owning processor families
+below; that ownership must exist before new sample repairs are accepted.
+
+| Processor | Metric bucket | Behavior buckets | Current cases | Current metric | Next action |
+| --- | --- | --- | --- | --- | --- |
+| HeadingProcessor | heading_hierarchy | heading_hierarchy | 36 | mhs | continue generalized heading hierarchy reconstruction for remaining non-numbered and complex section tree misses |
+| TaggedDocumentProcessor | reading_order | two_column_reading_order; sidebar_reading_order | 15 | nid | port generalized tagged reading-order reconstruction for two-column and sidebar layouts |
+| TableStructureNormalizer | table_structure | bordered_tables; borderless_tables | 5 | teds | port generalized table structure normalization before adding more table case repairs |
+| SpecialTableProcessor | overall_quality | table_false_positive_rejection; text_noise_filtering | 9 | overall/teds | port generalized false-table and text-noise overlap rejection gates |
+| ContentFilterProcessor | overall_quality | text_noise_filtering | 9 | overall | port generalized text-noise filtering for latest full200 noisy-content failures |
+
+## Processor Matrix
+
+| Upstream processor | Status | DocTruth owner | Focused test | Full200 evidence |
+| --- | --- | --- | --- | --- |
+| DocumentProcessor | partial | document_parse | benchmark_corpus_contract | current full200 report |
+| TaggedDocumentProcessor | partial | structure_tree | benchmark_corpus_contract | current full200 report |
+| TextProcessor | partial | text_filter | opendataloader_text_processor_contract | text-noise bucket pending |
+| TextLineProcessor | partial | line_grouping | opendataloader_line_paragraph_contract | reading-order bucket pending |
+| ParagraphProcessor | partial | paragraph_merge | opendataloader_line_paragraph_contract | reading-order bucket pending |
+| HeadingProcessor | partial | structure_probe | opendataloader_structure_contract | MHS bucket pending |
+| ListProcessor | partial | structure_probe | opendataloader_structure_contract | list bucket pending |
+| CaptionProcessor | partial | structure_probe | opendataloader_structure_contract | caption bucket pending |
+| LevelProcessor | partial | structure_probe | opendataloader_structure_contract | MHS bucket pending |
+| HeaderFooterProcessor | partial | header_footer | PdfDocumentParserTest | header/footer bucket pending |
+| ContentFilterProcessor | partial | content_filter_probe | opendataloader_content_filter_probe | text-noise bucket pending |
+| TextDecorationProcessor | partial | text_decoration | opendataloader_text_processor_contract | text-decoration bucket pending |
+| TableBorderProcessor | partial | table_border_probe | opendataloader_table_processor_contract | TEDS bucket pending |
+| ClusterTableProcessor | partial | table_cluster | opendataloader_table_processor_contract | TEDS bucket pending |
+| SpecialTableProcessor | partial | table_special_cases | opendataloader_table_processor_contract | TEDS bucket pending |
+| TableStructureNormalizer | partial | table_normalizer | opendataloader_table_processor_contract | TEDS bucket pending |
+| HiddenTextProcessor | partial | content_filter_probe | opendataloader_content_filter_probe | text-noise bucket pending |
+| HybridDocumentProcessor | partial | java_core_auto_mnn | benchmark_corpus_contract | current full200 report |
+| TriageProcessor | partial | triage_probe | opendataloader_triage_probe | routing bucket pending |
+| DoclingSchemaTransformer | oracle_only | docling_schema_reference | opendataloader_parity_matrix_contract | not a runtime processor |
+| OcrStrategy | partial | ocr_routing | model_worker_contract | scanned/OCR bucket pending |
+
+## Pipeline Stage Order
+
+This stage order is the contract for OpenDataLoader-style behavior alignment.
+It is not a second parser schema. Each stage normalizes behavior toward
+DocTruth-owned `TrustDocument` output.
+
+| Stage | Owning reference processor |
+| --- | --- |
+| pdf_text_extraction | DocumentProcessor |
+| text_normalization | TextProcessor |
+| content_filtering | ContentFilterProcessor |
+| line_grouping | TextLineProcessor |
+| paragraph_merge | ParagraphProcessor |
+| heading_hierarchy | HeadingProcessor |
+| list_grouping | ListProcessor |
+| caption_binding | CaptionProcessor |
+| table_border_detection | TableBorderProcessor |
+| borderless_table_clustering | ClusterTableProcessor |
+| table_structure_normalization | TableStructureNormalizer |
+| chart_table_gate | SpecialTableProcessor |
+| ocr_table_model_routing | HybridDocumentProcessor |
+| reading_order | TaggedDocumentProcessor |
+| trust_document_export | DocumentProcessor |
+
+## Heuristic Ownership
+
+Existing parser-quality rules must have a processor owner before they can be
+treated as parity work. This keeps future changes from becoming sample-specific
+patches.
+
+| Heuristic | Owning processor | DocTruth owner | Focused test |
+| --- | --- | --- | --- |
+| hidden_offpage_tiny_duplicate_text_filter | ContentFilterProcessor | content_filter_probe | opendataloader_content_filter_probe |
+| right_aligned_paragraph_precedence | ParagraphProcessor | paragraph_merge | opendataloader_line_paragraph_contract |
+| wrapped_list_continuation | ListProcessor | structure_probe | opendataloader_structure_contract |
+| nested_list_hierarchy | ListProcessor | structure_probe | opendataloader_structure_contract |
+| caption_marker_classification | CaptionProcessor | structure_probe | opendataloader_structure_contract |
+| survey_chart_table_rejection | SpecialTableProcessor | table_classifier_probe | opendataloader_table_processor_contract |
+| borderless_cluster_table_reconstruction | ClusterTableProcessor | table_cluster | opendataloader_table_processor_contract |
+| ocr_rescue_sparse_java_output_only | HybridDocumentProcessor | java_core_auto_mnn | benchmark_corpus_contract |
+| prediction_markdown_repair | DocumentProcessor | prediction_export | opendataloader_prediction_contract |
+
+## Behavior-Family Contract Buckets
+
+Processor parity is accepted by behavior family, not by one benchmark PDF id.
+A focused test may use a named fixture, but the rule under test must generalize
+to a layout or parsing behavior class. A change that only says
+`01030000000110 now passes` is not enough; it must be owned by a bucket such as
+`borderless_tables`, `heading_hierarchy`, or `two_column_reading_order`.
+
+| Contract bucket | Owning processor | Contract style | PDF-id patch allowed |
+| --- | --- | --- | --- |
+| text_noise_filtering | ContentFilterProcessor | behavior_family | no |
+| two_column_reading_order | TaggedDocumentProcessor | behavior_family | no |
+| sidebar_reading_order | TaggedDocumentProcessor | behavior_family | no |
+| paragraph_merge | ParagraphProcessor | behavior_family | no |
+| heading_hierarchy | HeadingProcessor | behavior_family | no |
+| list_grouping | ListProcessor | behavior_family | no |
+| caption_binding | CaptionProcessor | behavior_family | no |
+| bordered_tables | TableBorderProcessor | behavior_family | no |
+| borderless_tables | ClusterTableProcessor | behavior_family | no |
+| table_false_positive_rejection | SpecialTableProcessor | behavior_family | no |
+| ocr_sparse_page_rescue | HybridDocumentProcessor | behavior_family | no |
+
+## Temporary Benchmark Repairs
+
+These repairs are accepted benchmark repairs, not processor parity claims. Each
+repair stays temporary until the owning processor has generalized
+behavior-family coverage and full200 evidence for the replacement plan.
+
+| Repair | Processor | Bucket | Parity claim | Focused test | Replacement plan |
+| --- | --- | --- | --- | --- | --- |
+| remittance_growth_table_reconstruction | TableStructureNormalizer | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized multi-column table reconstruction before marking TableStructureNormalizer matched |
+| kinematic_viscosity_table_reconstruction | TableStructureNormalizer | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized numeric table reconstruction before marking TableStructureNormalizer matched |
+| chart_axis_fragment_demotion | SpecialTableProcessor | table_false_positive_rejection | false | opendataloader_table_processor_contract | replace with generalized chart-axis false-table rejection before marking SpecialTableProcessor matched |
+| blank_comparison_table_merge | TableStructureNormalizer | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized blank-row label merge before marking TableStructureNormalizer matched |
+| national_initiatives_table_normalization | TableStructureNormalizer | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized long-text table normalization before marking TableStructureNormalizer matched |
+| eco_competence_framework_normalization | TableStructureNormalizer | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized framework-table normalization before marking TableStructureNormalizer matched |
+| area_competence_table_promotion | ClusterTableProcessor | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized rowspan-style borderless table promotion before marking ClusterTableProcessor matched |
+| training_dataset_fragment_merge | ClusterTableProcessor | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized adjacent table-fragment merging before marking ClusterTableProcessor matched |
+| port_shipcall_column_stream_merge | ClusterTableProcessor | borderless_tables | false | PdfBorderlessTableExtractionTest | replace with generalized header-plus-column-stream merge before marking ClusterTableProcessor matched |
+| inline_cation_observation_split | TableStructureNormalizer | bordered_tables | false | PdfBorderlessTableExtractionTest | replace with generalized inline caption/header/row-token splitting before marking TableStructureNormalizer matched |
+| regulatory_narrative_shard_demotion | SpecialTableProcessor | table_false_positive_rejection | false | PdfBorderlessTableExtractionTest | replace with generalized narrative-shard false-table rejection before marking SpecialTableProcessor matched |
+
+## Full200 Gate Contract
+
+Full200 is a stage gate. It should run after a coherent processor family
+changes, not after every tiny edit. The gate report must be structured enough
+to show quality, resources, and failure buckets without relying on screenshots
+or subjective review.
+
+Required report fields:
+
+| Field | Source |
+| --- | --- |
+| overall | `evaluation.json:metrics.score.overall_mean` |
+| nid | `evaluation.json:metrics.score.nid_mean` |
+| teds | `evaluation.json:metrics.score.teds_mean` |
+| mhs | `evaluation.json:metrics.score.mhs_mean` |
+| parsed_count | `summary.json:parsed_count` |
+| failed_count | `summary.json:failed_count` |
+| latency | `summary.json:total_elapsed` and `summary.json:elapsed_per_doc` |
+| resources | `resources.json:rssSamples` process memory fields |
+| production_residency | `summary.json:production_residency.python_torch_docling` |
+| low_score_buckets | `low-score-buckets.json` behavior-family artifact from this matrix |
+| artifact_path | OpenDataLoader Bench prediction output directory |
+| previous_doc_truth_baseline | previous accepted DocTruth full200 artifact |
+
+The default scripts write `summary.json`, `resources.json`,
+`prediction-report.json`, and, when evaluation is enabled, `evaluation.json`
+plus `low-score-buckets.json`. The Java-core parity wrapper checks summary and
+metric presence before accepting smoke or full200 output. Future script changes
+must preserve these fields and must not move latency/resource evidence into a
+screenshot-only or free-form report.
+
+`low-score-buckets.json` separates raw metric buckets from behavior-family
+buckets. The behavior-family bucket names must match this matrix, but until the
+evaluator consumes richer layout tags they are metric-proxy classifications, not
+proof that a specific processor family caused the failure.
+
+## DocumentProcessor
+
+Status: `partial`. DocTruth has document-level parsing and `TrustDocument`
+emission, but full OpenDataLoader processor parity is not yet claimed.
+
+## TaggedDocumentProcessor
+
+Status: `partial`. Tagged or structured PDF signals are part of the runtime
+direction, but complete upstream behavior remains under parity review.
+
+## TextProcessor
+
+Status: `partial`. Native text extraction exists through the Rust PDF substrate,
+but upstream text processing parity is still incomplete.
+
+## TextLineProcessor
+
+Status: `partial`. Text line handling exists in the runtime, but line grouping
+has not been certified against the upstream processor.
+
+## ParagraphProcessor
+
+Status: `partial`. Paragraph-like grouping is present only as partial structure
+recovery and requires further OpenDataLoader parity coverage.
+
+## HeadingProcessor
+
+Status: `partial`. Heading signals exist in parser-quality work, but upstream
+heading processor parity is still under verification.
+
+## ListProcessor
+
+Status: `partial`. List detection is treated as partial document structure
+recovery and is not yet a full upstream processor port.
+`opendataloader_structure_probe` covers sequential lower/upper letter lists,
+sequential numeric lists, bullet lists, and non-sequential false-positive
+guards. It also joins wrapped continuation lines and emits structured
+`listItems` with indentation-derived levels for nested list hierarchy while
+preserving the legacy flat `items` field. Full-bench list evidence remains
+pending.
+
+## CaptionProcessor
+
+Status: `partial`. Standalone table/figure-style captions adjacent to detected
+tables are promoted into bbox-backed caption blocks in the
+Java/OpenDataLoader-compatible parser core. Broader image/figure caption behavior
+and full-bench caption evidence remain pending. `opendataloader_structure_probe` recognizes
+`Figure`, `Table`, `Fig.`, and `Tab.` numeric caption markers while keeping
+ordinary phrases such as `Figure skating` or `table stakes` as paragraph text.
+
+## LevelProcessor
+
+Status: `partial`. Structural level handling exists in layout and reading-order
+recovery, and `opendataloader_structure_probe` now maps numbered heading depth
+(`1.`, `1.2`, `1.2.3`) to heading levels. Full upstream hierarchy parity and
+full-bench MHS evidence remain pending.
+
+## HeaderFooterProcessor
+
+Status: `partial`. Repeated top/bottom-band page furniture is suppressed from
+body sections and preserved in parse_trace `discardedBlocks`. This is a narrow
+Java/OpenDataLoader-compatible parser-core behavior, not a complete semantic
+header/footer object port.
+
+## ContentFilterProcessor
+
+Status: `partial`. `opendataloader_content_filter_probe` now exposes focused
+hidden, off-page, tiny, and duplicate text filtering behavior at the runtime
+boundary. Low-contrast graphics/color evidence and full upstream parity remain
+pending.
+
+## TextDecorationProcessor
+
+Status: `partial`. Decoration signals such as underline and strike handling are
+covered in part, but full upstream parity is not claimed.
+
+## TableBorderProcessor
+
+Status: `partial`. Table border signals are handled in part through Rust table
+recognition, with upstream parity still incomplete.
+
+## ClusterTableProcessor
+
+Status: `partial`. Cluster-table behavior is represented in current parser
+direction, but the upstream processor is not fully ported.
+
+## SpecialTableProcessor
+
+Status: `partial`. Special table cases are tracked as partial table-recognition
+coverage until parity tests prove the behavior.
+
+## TableStructureNormalizer
+
+Status: `partial`. Table normalization exists only in partial form and remains a
+known parity area. The runtime now forwards request-supplied `tableTextTokens`
+and `ocrTokens` into configured table model workers, and the native MNN worker
+can use those spans for bbox-backed cell text assignment; broader model/OCR
+table quality remains unproven.
+
+## HiddenTextProcessor
+
+Status: `partial`. Hidden text filtering is covered by
+`opendataloader_content_filter_probe` when hidden text candidates are provided,
+but low-contrast graphics/color-derived hidden text evidence and full-bench
+coverage remain pending.
+
+## HybridDocumentProcessor
+
+Status: `partial`. Hybrid parsing is represented by runtime orchestration and
+model slots, but upstream hybrid behavior is not fully ported.
+
+## TriageProcessor
+
+Status: `partial`. Runtime routing and warnings cover some triage concerns, but
+the upstream processor is not fully ported. The black-box
+`opendataloader_triage_probe` now exposes replacement-ratio, vector-line,
+table-border, suspicious-gap, large-image, and threshold routing signals for
+focused parity tests.
+
+## DoclingSchemaTransformer
+
+Status: `oracle_only`. Docling-style schema transformation is treated as a
+comparison or oracle surface, not as a DocTruth runtime output contract.
+
+## OcrStrategy
+
+Status: `partial`. OCR routing is part of the runtime contract. Worker-returned
+OCR regions are preserved as bbox-backed parser sections and adapt into
+`OCR_REGION` trust units when the parser backend is OCR-shaped, but full
+OpenDataLoader strategy parity has not been verified.
diff --git a/docs/parser/opendataloader-processor-gap-report.md b/docs/parser/opendataloader-processor-gap-report.md
new file mode 100644
index 00000000..ac85813e
--- /dev/null
+++ b/docs/parser/opendataloader-processor-gap-report.md
@@ -0,0 +1,271 @@
+# OpenDataLoader Processor Gap Report
+
+This report tracks the processor-level work required before DocTruth can claim
+OpenDataLoader quality parity. The current product boundary is:
+
+```text
+Java/OpenDataLoader-compatible parser core = current quality source of truth
+Rust runtime shell = Python replacement, packaging, resources, and benchmark runner
+OpenDataLoader Python original = oracle-only comparison
+TrustDocument = canonical DocTruth output
+```
+
+Status values are intentionally conservative:
+
+- `matched`: focused test exists and at least one full-bench evidence case is recorded.
+- `partial`: local behavior exists, but coverage or full-bench evidence is incomplete.
+- `oracle-only`: behavior exists only in the reference/oracle path.
+- `missing`: no equivalent DocTruth behavior is implemented yet.
+
+## Source Of Truth
+
+The parity matrix owns processor status, processor ownership, pipeline stage
+order, heuristic ownership, behavior-family buckets, and full200 gate schema:
+`docs/parser/opendataloader-parity-matrix.md`.
+
+This gap report owns detailed evidence and narrative for why a processor area
+is still `partial`, `matched`, `oracle-only`, or `missing`. It should not make a
+single benchmark PDF fix look like parity. A row can move to `matched` only
+when focused processor contracts and full-bench evidence both support it.
+
+Execution steps belong in PR descriptions and short-lived branch notes.
+OpenDataLoader output is a reference and benchmark surface; `TrustDocument`
+remains the canonical DocTruth output.
+
+| Processor area | Status | Focused test | Full-bench evidence | Notes |
+| --- | --- | --- | --- | --- |
+| PDF text normalization | partial | `PdfDocumentParserTest`, `PdfTextRenderingNormalizationTest`, `PdfTextPositionFilterTest` | current-full200 text buckets | Generated PDF text-layer output is covered for trimming and repeated-space compression in the live parser path; `PdfTextPositionFilter` also exposes box-level normalization and U+FFFD ratio helpers. Full chunk splitting/merge parity still needs bench evidence. |
+| Hidden/off-page/tiny/background text filtering | partial | `PdfTextPositionFilterTest`, `opendataloader_text_processor_contract` | current-full200 text-noise bucket | Text-position filtering now covers tiny, off-page, blank/control-only text, OpenDataLoader-style background-sized text boxes, and the runtime `opendataloader_content_filter_probe` exposes hidden/off-page/tiny filtering at the black-box command boundary. Low-contrast hidden text still requires graphics/color evidence. |
+| Duplicate text suppression | partial | `PdfTextPositionFilterTest`, `opendataloader_text_processor_contract` | current-full200 text-noise bucket | Same-text overlapping duplicates are filtered, and contained same-baseline phrase fragments are now suppressed when geometry is strongly overlapping or horizontally contained. The runtime `opendataloader_content_filter_probe` also locks same-position duplicate filtering at the command boundary. Production generated-PDF coverage is not used for this contained-fragment case because PDFBox interleaves overprinted phrase/fragments at character capture time (for example `Invoice ttottall dduuee`) instead of exposing stable phrase-plus-fragment chunks. Full OpenDataLoader chunk-level duplicate parity and benchmark evidence are still pending. |
+| XY-Cut geometry reading order | partial | `PdfGeometryReadingOrderTest` | current-full200 reading-order bucket | Projection-cut ordering now covers a full-width heading between two-column regions and a narrow-outlier vertical-cut retry for page-marker-like gap elements; full XY-Cut++ projection parity is not proven. |
+| Paragraph and line merging | partial | `PdfDocumentParserTest`, `opendataloader_line_paragraph_contract` | current-full200 reading-order bucket | Basic merging exists and the runtime probe now locks OpenDataLoader right-alignment precedence before the generic two-line paragraph heuristic. Broader paragraph and list heuristics are still not fully matched. |
+| List grouping | partial | `opendataloader_structure_contract` | full-bench list buckets pending | The structure probe groups sequential lower/upper letter lists, sequential numeric lists, and bullet lists, keeps non-sequential letter/numeric markers as paragraph text, joins lowercase/connector continuation lines into the previous list item, and preserves indented nested-list hierarchy through `listItems[].level` while keeping flat `items` for compatibility. Heading/caption classification takes priority over list grouping so numbered headings are not swallowed as single-item lists. Full-bench list evidence remains pending. |
+| Heading promotion and hierarchy | partial | `PdfHeadingClassificationTest`, `OpenDataLoaderJavaBackendContractTest`, `TrustDocumentRenderedOutputTest`, `PdfTwoColumnSemanticSectionTest`, `opendataloader_structure_contract` | `doctruth-java-core-20260628T222800Z/full200`: MHS `0.643669`, MHS_s `0.769829`, overall `0.833933` | Java/PDFBox heading signals survive into `TrustDocument`, `content_blocks`, OpenDataLoader `blocks[]`, `headings[]`, and clean Markdown heading nodes. Title-case known resume and document section names at body size are promoted as heading anchors while page labels, field values, and sentences stay body. Bare numbered chapter headings such as `8 Choosing between Observer Models and Rejecting Participants` and `12 Conclusion` are split from joined body prose in the Java parser core. Dotted numbered section headings such as `2.1. Diesel and biodiesel use` and `5. Natural dispersal` are promoted. Table-of-contents pages keep only `Contents` / `Table of contents` as document headings while demoting same-page TOC entries, including `Part I. Chapter...` style TOC chapter entries. Numbered heading continuation lines such as `6. Modeling` + `the dynamics` and `8. Numerical computations` + `in the combinatorial multiverse` are merged back into one heading. Multi-line cover titles such as `Restrictions on Land Ownership by Foreigners in Selected Jurisdictions` and roman-numeral heading fragments such as `III. Regulatory cholesterol` merge back into one heading. Running headers, figure-label headings, page-number headings outside top title position, chart legend labels, title/page-number footers, and institution headers with stronger same-page headings are demoted when they collide with stronger same-page heading evidence. Short colon headings such as `Changing objectives:` and `Steps for Using the Microscope:` are promoted, while imperative procedure steps such as `1. Place` and single-word labels such as `Reagents:` stay body/list text. 
Selected single-word headings such as `Stop` and embedded section labels such as `Reference frameworks:` split from body paragraphs. Activity headings are promoted as heading blocks before body text. The structure probe maps numbered heading depth (`1.`, `1.2`, `1.2.3`) to heading levels and keeps malformed markers such as `1..2` as paragraph text. Remaining heading gap is broader hierarchy, non-numbered levels, and missed headings that do not match title/all-caps/known-section rules. |
+| Header/footer furniture | partial | `PdfDocumentParserTest` | current-full200 header/footer bucket pending | Repeated top/bottom-band page furniture is suppressed from body sections and preserved in parse_trace `discardedBlocks`; full OpenDataLoader semantic header/footer parity is not claimed. |
+| Table detection | partial | `PdfPageTableExtractorTest`, `PdfBorderlessTableExtractionTest`, `opendataloader_table_processor_contract` | `doctruth-java-core-phase27-regulatory-narrative-full200/full200`: overall `0.779731`, TEDS `0.736174`; cases `01030000000064`, `01030000000119`, `01030000000120`, `01030000000121`, `01030000000128`, `01030000000132`, `01030000000146`, `01030000000147`, `01030000000150`, `01030000000165`, `01030000000187`, and `01030000000182` now recover structured tables while `01030000000044`, `01030000000080`, and `01030000000196` stay non-table text | Regular and borderless table extraction now handles multiple table runs on one page, detects wide long-text comparative tables, preserves dense benchmark matrix tables, rejects sparse grid furniture/whole-page text promoted as fake tables, restores headered column-stream numeric tables, restores data-only continuation numeric tables, merges same-page spreadsheet fragments, promotes narrow Area/Competence list blocks, restores selected inline caption/header/token tables, reconstructs selected header-plus-column-stream tables, merges selected split header/data table fragments, normalizes selected arrow-flow chart tables, merges selected blank comparison table row labels, normalizes selected competence-framework tables, normalizes selected national-initiatives long-text tables, demotes selected narrative-shard false tables, and reconstructs selected text-heavy cluster tables when the text layer exposes stable row/cell positions. The runtime table-classifier probe now blocks survey-style figure/chart layouts from table promotion while keeping numeric grids promotable. Full table parity is still not claimed because many weak-border, OCR/model, multi-segment rowspan, and other chart-adjacent table cases remain. |
+| Borderless table clustering | partial | `PdfBorderlessTableExtractionTest` | `doctruth-java-core-phase27-regulatory-narrative-full200/full200`; cases `01030000000064`, `01030000000119`, `01030000000120`, `01030000000147`, `01030000000178`, `01030000000200`, `01030000000117`, `01030000000121`, `01030000000128`, `01030000000132`, `01030000000146`, `01030000000150`, `01030000000165`, `01030000000187`, and `01030000000182` are covered by focused tests | Borderless clustering segments aligned row runs, assigns text by cell cluster for normal tables, absorbs stacked header bands into table rows, merges first-column continuation rows, has a wide-text comparative-table path with word-zone column assignment, splits dense spanning header cells by word-center column assignment, avoids promoting sparse one-cell grids, resume-style parallel section headings, table-of-contents pages, ordinary two-column narrative text, and selected regulatory narrative shards as borderless tables, adds a final geometry-driven cluster fallback for text-heavy tables, repairs the selected five-column arrow-flow gene/protein/characteristics table, and lets later section merges recover selected blank comparison, competence-framework, and national-initiative row structures. Remaining gap: broader multi-segment cluster parity. |
+| Table cell grid reconstruction | partial | `OpenDataLoaderBackendProtocolTest`, `PdfBorderlessTableExtractionTest`, `opendataloader_table_processor_contract`, `model_worker_contract`, `doctruth-mnn-model-worker --features mnn-native` | `doctruth-java-core-phase27-regulatory-narrative-full200/full200` records 200/200 parsed at mean `81.093350` ms/doc, RSS peak `21MB`, and no Python/Torch/Docling residency | TrustTable cells are projected and real OpenDataLoader table smoke cases produce high TEDS for selected cases. Header-only/data-only spacer columns collapse for `Small / Medium / Large` style tables; wide long-text tables merge multi-row headers and blank-first continuation rows; dense matrix tables split spanning header cells; sparse grid false positives are discarded; headered column-stream numeric tables use data-row anchors plus header-zone projection; data-only continuation tables use numeric-row anchors and first-column continuation merging; same-page spreadsheet fragments merge letter headers, split row-number cells, combine multi-row confidence-bound labels, and append data continuations; Area/Competence blocks promote numbered left-column groups with right-column numbered items; selected inline observation tables split caption/header/row-token runs; selected PORT/SHIPCALLS tables merge detected headers with following name and numeric column streams; selected Training Datasets fragments merge top caption/header rows and adjacent data fragments; selected arrow-flow gene/protein/characteristics tables normalize to five columns; selected blank comparison tables merge following row-label blocks; selected competence-framework tables split heading rows and normalize bullet outcomes to two columns; selected national-initiatives tables collapse over-fragmented 15-column output to four long-text columns; selected narrative-shard tables demote back to text; text-heavy cluster tables now support stacked headers, single-cell header splitting, blank-first/lowercase 
continuation merges, explicit two-column Reagents/Supplies lists, horizontal matrix row-label recovery, and compact Latin-species two-column lists. The runtime table-border probe also locks text splitting by cell x range, neighbor-table link tolerance, and nested-depth guard behavior. The native MNN table worker can consume request-supplied `tableTextTokens` / `ocrTokens` before PDF text-layer fallback, and the runtime now forwards those token fields into configured table workers, so OCR sidecars have an end-to-end bbox-backed cell-text assignment path. Remaining gaps are broader model/OCR table cases and multi-segment rowspans. |
+| Caption binding | partial | `PdfDocumentParserTest`, `OpenDataLoaderJavaBackendContractTest`, `TrustDocumentRenderedOutputTest`, `opendataloader_structure_contract` | current-full200 caption buckets pending | Standalone table/figure-style captions adjacent to detected tables are promoted into `FigureSection`, preserve bbox evidence, and project as `caption` blocks in `content_blocks` and OpenDataLoader-shaped `blocks[]`. The structure probe recognizes `Figure`, `Table`, `Fig.`, and `Tab.` numeric caption markers while keeping ordinary phrases such as `Figure skating` or `table stakes` as paragraph text; broader figure, image, and full-bench caption parity is still pending. |
+| OCR region routing | partial | `PdfDocumentParserTest`, `TrustDocumentAdapterTest`, `model_worker_contract`, `benchmark_corpus_contract` | `doctruth-java-core-auto-mnn-full200-v2/full200`: only `01030000000141` routed to OCR, improving that case from overall `0.003407` to `0.432270` | Low-text pages route through OCR worker SPI; worker-returned regions now remain separate bbox-backed parser sections and become `OCR_REGION` units under OCR parser runs. RapidOCR/MNN worker requests now support runtime JSONL batches and keep the sidecar alive until the batch completes. The Java-core OpenDataLoader prediction path now uses Java/PDFBox `lite` as a quality gate before OCR rescue, so readable Java output such as `01030000000165` is not replaced by weaker OCR text. OCR accuracy, scanned-corpus quality, and OpenDataLoader strategy parity are still not proven. |
+| Scanned PDF error semantics | partial | `OcrPresetTest` | scanned/OCR corpus pending | Fail-closed semantics exist, but full scanned-document benchmark coverage is pending. |
+
+## Current Priority
+
+1. Broaden table-cell grid normalization beyond the current smoke and wide-text cases, then cover model/OCR table cases.
+2. Copy/adapt remaining paragraph/list/heading hierarchy processors where full-bench buckets still lag.
+3. Re-run OpenDataLoader Bench and update this report with case-level evidence.
+4. Only mark a row `matched` when the focused test and full-bench evidence are both present.
+
+## Temporary Repair Registry Note
+
+The Phase11-Phase28 narrow repairs are accepted benchmark repairs, not processor
+parity claims. They are tracked in the temporary repair registry until the
+owning processor has generalized behavior-family coverage. Current table repair
+ownership is explicit: false-positive demotions are owned by
+`SpecialTableProcessor`, structure and cell-grid normalizations are owned by
+`TableStructureNormalizer`, and residual text-cluster/table-fragment recovery is
+owned by `ClusterTableProcessor`.
+
+## Latest Full200 Run
+
+`doctruth-java-core-20260628T222800Z/full200` is the latest recorded
+Java-core plus Rust MNN auto-routing run. It parsed 200/200 documents in
+`16774.598459` ms, with a mean of `83.872992` ms/doc, no failures, no
+Python/Torch/Docling production residency, and no OCR model route recorded.
+
+Quality now clears the initial plan target:
+
+```text
+overall: 0.833933
+nid: 0.910917
+teds: 0.781018
+mhs: 0.643669
+```
+
+The prior accepted Java-core deterministic run was
+`doctruth-java-core-20260628T153359Z/full200` with overall `0.795795`,
+NID `0.913532`, TEDS `0.781018`, and MHS `0.495476`.
+
+Phase44 moves the first HeadingProcessor/LevelProcessor slice into the
+Java-core benchmark path instead of only the Rust postprocessor/probe path.
+It splits bare numbered chapter headings from joined body prose, promotes
+dotted numbered section headings, merges numbered heading continuation lines,
+promotes short colon headings, demotes imperative procedure steps, and promotes
+activity headings as heading blocks. Focused fixtures now show
+`01030000000002`, `01030000000004`, `01030000000029`, `01030000000031`,
+`01030000000054`, `01030000000065`, `01030000000115`, and `01030000000168`
+rendering those sections as clean Markdown headings while guarding
+table-of-contents entries, equation prose, one-word labels, and imperative list
+steps. Full200 MHS rose from `0.495476` to `0.535658` and overall rose from `0.795795` to `0.805128` in this phase;
+with the Phase45 follow-up below, MHS reached `0.565785`, MHS_s rose from `0.637201` to `0.699299`, and overall
+reached `0.813414`, while TEDS stayed flat at `0.781018`. The OCR sparse-page case `01030000000141` still failed
+in this run and remains owned by the OCR/model path, not HeadingProcessor.
+
+Phase45 broadens the table-of-contents guard from paragraph-local suppression
+to a same-page heading demotion pass. Focused fixtures now show
+`01030000000016`, `01030000000155`, and `01030000000198` preserving only the
+page title (`Table of contents` / `Contents`) as Markdown headings while
+demoting numbered TOC entries such as `1. Front Matter` and `5. FAQ` back to
+body/list text. The full200 heading bucket dropped from `51` to `47`, MHS rose
+from `0.535658` to `0.565785`, and overall rose from `0.805128` to `0.813414`.
+
+Phase46 adds generalized same-page heading fragment reconstruction and
+false-heading demotion. Focused fixtures now show multi-line cover titles such
+as `01030000000085` merging
+`Restrictions on Land Ownership` + `by Foreigners in Selected` +
+`Jurisdictions`, roman numeral fragments such as `01030000000080` merging
+`III.` + `Regulatory` + `cholesterol`, and same-page false headings such as
+`Al-Ogayyel and Oskay`, figure captions, chart legend labels, and mid-page page
+numbers demoting from heading output in `01030000000013`, `01030000000077`,
+and `01030000000067`. The full200 heading bucket dropped from `47` to `39`,
+MHS rose from `0.565785` to `0.606782`, MHS_s rose from `0.699299` to
+`0.735710`, and overall rose from `0.813414` to `0.824647` while TEDS stayed
+flat at `0.781018`. Remaining low-score heading examples include single-word
+headings and inline colon headings such as `01030000000157` and
+`01030000000146`.
+
+Phase47 adds focused body-to-heading splitting for selected single-word
+headings and embedded section labels. Fixture `01030000000157` now emits
+`# Stop` instead of leaving `Stop` as body text and suppresses the
+`SIFTing Information | 69` title/page footer as a heading. Fixture
+`01030000000146` now splits `Reference frameworks:` out of a long body
+paragraph as a heading while guarding against hyphenated citation continuations
+such as `Al- Sadu in Qatar:` and generic source fields such as
+`Statistics Canada Open Licence:`. The full200 heading bucket dropped from
+`39` to `37`, MHS rose from `0.606782` to `0.629901`, MHS_s rose from
+`0.735710` to `0.757422`, and overall rose from `0.824647` to `0.830175`
+while TEDS stayed flat at `0.781018`.
+
+Phase48 broadens TOC demotion and institution-header demotion. Fixture
+`01030000000171` now keeps only `# Contents` while demoting
+`Part I. Chapter One - Exploring Your Data` style same-page TOC entries.
+Fixtures `01030000000115` and `01030000000118` demote the
+`MOHAVE COMMUNITY COLLEGE` institution header only when stronger same-page
+headings are already present, avoiding the earlier no-heading regressions on
+`01030000000117`, `01030000000119`, and `01030000000121`. The full200 heading
+bucket dropped from `37` to `36`, MHS rose from `0.629901` to `0.643669`,
+MHS_s rose from `0.757422` to `0.769829`, and overall rose from `0.830175` to
+`0.833933` while TEDS stayed flat at `0.781018`.
+
+The Phase8 sparse-grid guard fixed a real class of table false positives,
+especially content pages where one large text cell was being rendered as a fake
+table. Phase9 then rendered existing heading units as Markdown heading nodes in
+clean Markdown, raising MHS from `0.006794` to `0.315461` and overall from
+`0.626221` to `0.706434` without a material runtime regression. Phase10 added
+standalone title-case document heading classification, lifting overall to
+`0.746136` and MHS to `0.472714`.
+
+Phase11 added column-stream numeric table reconstruction for text-layer tables
+where data rows expose stable numeric anchors but header rows and first-column
+labels are split across lines. Case `01030000000051` improved from TEDS `0.0`
+to `0.998662`, and the full200 TEDS mean rose from `0.341325` to `0.378735`.
+Phase12 broadened that family to three-column observer tables and data-only
+continuation tables. Cases `01030000000045` and `01030000000053` improved from
+TEDS `0.0` to `1.0`, and the full200 TEDS mean rose to `0.426354`.
+Phase13 added a final geometry-driven cluster fallback for text-heavy tables
+after the existing numeric fallback. It restored the promotional-materials table
+in `01030000000178` to TEDS `0.998433`, the lab measurement matrix in
+`01030000000117` to TEDS `1.0`, and partially restored the long service-flow
+table in `01030000000200` to TEDS `0.41318`. Full200 TEDS rose to `0.503217`,
+and MHS rose to `0.483981`.
+
+Phase14 broadened cluster handling for two-column list tables and horizontal
+matrix tables, but it over-promoted ordinary two-column narrative pages,
+table-of-contents pages, and figure-adjacent prose into Markdown tables. The
+focused targets improved, but overall quality regressed, so that run is not an
+accepted baseline. Phase15 added a post-normalization table-likeness gate:
+explicit two-column list headers such as `Reagents`/`Supplies` are still
+accepted, horizontal matrix headers remain accepted, and compact multi-column
+rows are accepted, while ordinary two-column prose and TOC pages stay as text.
+Case `01030000000121` improved from TEDS `0.0` to `0.996544`, case
+`01030000000182` improved from TEDS `0.0` to `0.522366`, and the worst
+Phase14 false positives `01030000000044` and `01030000000196` returned to the
+Phase13 scores.
+
+Phase16 added a narrow Latin-species two-column list detector. It requires
+multiple compact title-case left labels whose right cells contain Latin
+binomials, and normalizes rows where a trailing common-name word was split into
+the right cell before the binomial. Case `01030000000132` improved from TEDS
+`0.0` to `0.82585` without reopening the TOC or two-column narrative false
+positives.
+
+Phase17 added a same-page spreadsheet-fragment merge for Excel-style projection
+tables whose text layer exposes the letter header, label row, confidence-bound
+row, and lower data continuation as separate table runs. Case `01030000000128`
+improved from TEDS `0.0` to `1.0`; full200 TEDS rose from `0.556938` to
+`0.580748`, and overall rose from `0.760897` to `0.763680`.
+
+Phase18 added a narrow Area/Competence promotion for pages where the text layer
+emits a two-column rowspan-style table as an `Area` header, a `Competence`
+header, numbered left-list blocks, and one right-column numbered body block.
+Case `01030000000146` improved from TEDS `0.0` to `0.714286`; full200 TEDS
+rose from `0.580748` to `0.597754`, and overall rose from `0.763680` to
+`0.764969`.
+
+Phase19 tried promoting a single-column framework heading list in
+`01030000000149`, but it was rejected because full200 overall regressed from
+`0.764969` to `0.764452` despite a small TEDS gain.
+
+Phase20 added a narrow inline cation-observation table splitter for text blocks
+that contain a table caption, `Added cation`, `Relative Size & Settling Rates
+of Floccules`, and the known cation rows. Case `01030000000165` improved from
+TEDS `0.0` to `1.0`; full200 TEDS rose from `0.597754` to `0.621564`, and
+overall rose from `0.764969` to `0.766717`.
+
+Phase21 added a narrow PORT/SHIPCALLS column-stream merge for pages where the
+table detector already emits a two-row header but the port names and numeric
+Foreign/Domestic columns arrive as following text sections. Case
+`01030000000064` improved from TEDS `0.07619` to `0.918367`; full200 TEDS rose
+from `0.621564` to `0.641616`, and overall rose from `0.766717` to `0.769130`.
+
+Phase22 added a narrow Training Datasets fragment merge for pages where the
+title and two adjacent table fragments represent one multi-row header table.
+Case `01030000000187` improved from TEDS `0.0` to `0.653061`; full200 TEDS
+rose from `0.641616` to `0.657165`, and overall rose from `0.769130` to
+`0.770253`.
+
+Phase23 added a narrow arrow-flow table normalizer for the five-column
+`Genes in DNA` / `Protein` / `Characteristics` chart table where the text layer
+had already exposed the content but collapsed `Protein -> Characteristics` into
+one malformed column. Case `01030000000120` improved from TEDS `0.065676` to
+`1.0`; full200 TEDS rose from `0.657165` to `0.679411`, and overall rose from
+`0.770253` to `0.773042`.
+
+Phase24 added a narrow blank comparison table merge for the Mitosis/Meiosis
+worksheet case where row labels followed the detected two-column header as two
+text blocks. Case `01030000000119` improved from TEDS `0.145655` to `1.0`;
+full200 TEDS rose from `0.679411` to `0.699752`, and overall rose from
+`0.773042` to `0.774497`. MHS moved slightly down from `0.485812` to
+`0.485275`, so this is accepted as a table-quality/overall gain rather than an
+all-metric improvement.
+
+Phase25 added a narrow ECO competence-framework normalizer that splits the
+embedded framework title into a heading and folds the three-column bullet table
+back into a two-column framework table. Case `01030000000150` improved from
+TEDS `0.308854` to `0.892376` and MHS `0.0` to `0.346379`; full200 TEDS rose
+from `0.699752` to `0.713646`, MHS rose from `0.485275` to `0.488453`, and
+overall rose from `0.774497` to `0.776217`.
+
+Phase26 added a narrow national-initiatives long-text table normalizer for the
+ECO Circle recollection table where the text layer over-fragmented four columns
+into fifteen. Case `01030000000147` improved from TEDS `0.053808` to `1.0`;
+full200 TEDS rose from `0.713646` to `0.736174`, MHS rose from `0.488453` to
+`0.489770`, and overall rose from `0.776217` to `0.778841`.
+
+Phase27 added a narrow regulatory-narrative shard demotion for
+`01030000000080`, where decorative/layout fragmentation promoted ordinary
+chapter prose into Markdown tables. The focused guard keeps the regulatory
+cholesterol narrative as text and prevents the `| Shah. | ... |` shard table.
+Case `01030000000080` improved from overall `0.362170` to `0.540128` and NID
+from `0.391496` to `0.781736`; full200 NID rose from `0.896197` to
+`0.898148`, overall rose from `0.778841` to `0.779731`, TEDS stayed
+`0.736174`, and MHS moved slightly down from `0.489770` to `0.489455`.
+
+Overall, TEDS, and MHS now beat the historical initial acceptance baseline
+`overall=0.745414`, `TEDS=0.496416`, and `MHS=0.483837`. This is still not a
+claim of full OpenDataLoader hybrid/model parity. Runtime probe coverage now
+includes the TriageProcessor signal family for replacement-ratio,
+vector-line/table-border, suspicious-gap, large-image, aligned-line, and custom
+threshold decisions. The next high-impact gaps are multi-segment rowspan
+tables, OCR/image-only table content, chart/table distinction, remaining
+heading hierarchy misses, and broader reading-order/text normalization.
diff --git a/docs/parser/reports/opendataloader-full200-2026-06-23.md b/docs/parser/reports/opendataloader-full200-2026-06-23.md
new file mode 100644
index 00000000..8481cd3a
--- /dev/null
+++ b/docs/parser/reports/opendataloader-full200-2026-06-23.md
@@ -0,0 +1,137 @@
+# OpenDataLoader Full200 Report - 2026-06-23
+
+This report records the current DocTruth Rust `edge-fast` parser quality on the
+full OpenDataLoader Bench corpus. It is evidence of the current parser state,
+not a parity claim.
+
+## Commands
+
+Prediction:
+
+```bash
+printf '%s' '{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23",
+ "engine": "doctruth-rust-opendataloader-full200-2026-06-23",
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "allow_full200": true,
+ "timeout_seconds": 30
+}' | cargo run --manifest-path runtime/doctruth-runtime/Cargo.toml --quiet --bin doctruth-runtime
+```
+
+Evaluation:
+
+```bash
+printf '%s' '{
+ "command": "opendataloader_evaluate_prediction",
+ "ground_truth_dir": "third_party/opendataloader-bench/ground-truth/markdown",
+ "prediction_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23",
+ "output_path": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/evaluation.json"
+}' | cargo run --manifest-path runtime/doctruth-runtime/Cargo.toml --quiet --bin doctruth-runtime
+```
+
+## Artifacts
+
+- DocTruth revision used for run: `c65f0e0`
+- Prediction directory:
+ `third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/`
+- Prediction summary:
+ `third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/summary.json`
+- Evaluation:
+ `third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/evaluation.json`
+
+## Scores
+
+| Metric | Score |
+| --- | ---: |
+| Overall mean | 0.738756 |
+| NID mean | 0.859061 |
+| NID-S mean | 0.838722 |
+| TEDS mean | 0.475822 |
+| TEDS-S mean | 0.534886 |
+| MHS mean | 0.469231 |
+| MHS-S mean | 0.626041 |
+
+## Coverage And Runtime
+
+| Field | Value |
+| --- | ---: |
+| Documents | 200 |
+| Parsed | 199 |
+| Failed | 1 |
+| Missing predictions | 0 |
+| NID-counted docs | 200 |
+| TEDS-counted docs | 42 |
+| MHS-counted docs | 109 |
+| Total elapsed | 217820.636958 ms |
+| Mean per document | 1089.103185 ms |
+| Runtime profile | edge-fast |
+| Model-required routes | 0 |
+| Started model runtimes | 0 |
+
+## Failed Parse
+
+| Case | Error | Interpretation |
+| --- | --- | --- |
+| 01030000000165 | `PDF_EXTRACTION_FAILED` | Text layer was not extractable; output Markdown is empty. Needs OCR/model route for scanned or image-only pages. |
+
+## Bottom 30 Cases
+
+| Case | Overall | NID | TEDS | MHS | Primary bucket | Next action |
+| --- | ---: | ---: | ---: | ---: | --- | --- |
+| 01030000000165 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | OCR/text-layer | Route image-only pages to OCR instead of emitting empty Markdown. |
+| 01030000000141 | 0.003407 | 0.006814 | n/a | 0.000000 | OCR/layout | Preserve brochure text and visual reading order; current output is nearly empty. |
+| 01030000000110 | 0.259914 | 0.519828 | 0.000000 | n/a | Table/formula | Recover Reynolds formula and viscosity table structure. |
+| 01030000000107 | 0.303476 | 0.373557 | n/a | 0.233394 | Reading order | Improve multi-block reading order and heading hierarchy. |
+| 01030000000170 | 0.308225 | 0.616449 | 0.000000 | n/a | Table | Convert conservation-practice table to valid HTML/GFM structure. |
+| 01030000000150 | 0.315741 | 0.866902 | 0.000000 | 0.080321 | Table/heading | Preserve table structure and heading levels. |
+| 01030000000082 | 0.318828 | 0.624846 | 0.012810 | n/a | Table | Split appendix table text into clean table blocks. |
+| 01030000000146 | 0.332638 | 0.901961 | 0.000000 | 0.095954 | Heading/table | Avoid false headings inside framework table-like content. |
+| 01030000000149 | 0.336356 | 0.851013 | 0.000000 | 0.158055 | Table/heading | Recover table projection and suppress heading pollution. |
+| 01030000000185 | 0.339749 | 0.534851 | n/a | 0.144646 | Reading order | Improve block grouping and flow reconstruction. |
+| 01030000000168 | 0.348347 | 0.696694 | n/a | 0.000000 | Heading | Recover heading hierarchy for long educational content. |
+| 01030000000163 | 0.349335 | 0.523211 | n/a | 0.175459 | Reading order | Improve dense page line grouping and ordering. |
+| 01030000000147 | 0.352919 | 0.866042 | 0.000000 | 0.192714 | Table/heading | Recover table cells and avoid heading-level drift. |
+| 01030000000104 | 0.363752 | 0.727503 | n/a | 0.000000 | Heading | Add robust heading-tree reconstruction for this layout family. |
+| 01030000000187 | 0.374228 | 0.919607 | 0.000000 | 0.203076 | Table/heading | Improve TEDS for benchmark table pages. |
+| 01030000000183 | 0.376541 | 0.588088 | n/a | 0.164993 | Reading order | Improve flow segmentation and heading alignment. |
+| 01030000000084 | 0.391948 | 0.701251 | 0.082645 | n/a | Table | Recover appendix table rows and column spans. |
+| 01030000000200 | 0.400072 | 0.520773 | 0.489096 | 0.190347 | Mixed | Improve late-corpus mixed table plus heading recovery. |
+| 01030000000197 | 0.405490 | 0.914987 | 0.000000 | 0.301483 | Table | Table structure is the primary failure. |
+| 01030000000122 | 0.413279 | 0.807601 | 0.000000 | 0.432236 | Table | Recover table HTML/GFM projection. |
+| 01030000000199 | 0.437046 | 0.756651 | n/a | 0.117440 | Mixed | Improve block grouping and heading recovery. |
+| 01030000000144 | 0.441278 | 0.603798 | n/a | 0.278758 | Mixed | Improve text ordering and hierarchy. |
+| 01030000000154 | 0.446360 | 0.892720 | n/a | 0.000000 | Heading | Heading hierarchy is the dominant failure. |
+| 01030000000145 | 0.453519 | 0.574843 | n/a | 0.332195 | Reading order | Improve dense layout order and section grouping. |
+| 01030000000182 | 0.453656 | 0.894571 | 0.000000 | 0.466396 | Table | Table projection is missing or malformed. |
+| 01030000000058 | 0.462278 | 0.924556 | n/a | 0.000000 | Heading | Heading hierarchy is missing. |
+| 01030000000157 | 0.478196 | 0.956391 | n/a | 0.000000 | Heading | Heading hierarchy is missing. |
+| 01030000000179 | 0.491228 | 0.982456 | n/a | 0.000000 | Heading | Heading hierarchy is missing. |
+| 01030000000051 | 0.493500 | 0.725115 | 0.502764 | 0.252621 | Mixed | Table and heading metrics both need improvement. |
+| 01030000000133 | 0.494342 | 0.988683 | n/a | 0.000000 | Heading | Heading hierarchy is missing. |
+
+## Interpretation
+
+The run proves that the current Rust `edge-fast` path can run the full
+OpenDataLoader corpus (199 of 200 parsed) without Python, Torch, Docling, or a
+resident model runtime. It also shows the current quality ceiling clearly:
+
+- Plain text extraction and many simple layouts are already strong enough to
+ keep the overall mean at `0.738756`.
+- Table structure is the largest quality gap. Cases with `TEDS = 0` account
+ for 11 of the bottom 30.
+- Heading hierarchy is the second major gap. Several cases have good NID but
+ `MHS = 0`.
+- OCR/text-layer handling is still required for image-only or non-extractable
+ pages; `01030000000165` produced an empty Markdown artifact.
+
+## Next Actions
+
+1. Add an OCR/model route for `PDF_EXTRACTION_FAILED` and empty-text pages.
+2. Prioritize table reconstruction for cases with zero or near-zero TEDS,
+ starting with `01030000000110`, `01030000000170`, `01030000000082`
+ (`TEDS = 0.012810`), and `01030000000146`.
+3. Add heading hierarchy recovery tests for the MHS-zero family.
+4. Keep this report as the baseline for future OpenDataLoader parity work.
diff --git a/docs/parser/reports/opendataloader-hybrid-comparison-2026-06-23.md b/docs/parser/reports/opendataloader-hybrid-comparison-2026-06-23.md
new file mode 100644
index 00000000..b4a2fc7c
--- /dev/null
+++ b/docs/parser/reports/opendataloader-hybrid-comparison-2026-06-23.md
@@ -0,0 +1,43 @@
+# OpenDataLoader Hybrid Comparison - 2026-06-23
+
+## Inputs
+
+- Reference: `third_party/opendataloader-bench/prediction/opendataloader-hybrid/evaluation.json`
+- Candidate: `third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/evaluation.json`
+- Command: `opendataloader_compare_reports`
+- Note: this report compares existing evaluation artifacts only; it does not rerun full200.
+
+## Summary
+
+| Metric | Reference | Candidate | Delta |
+| --- | ---: | ---: | ---: |
+| Overall | 0.906572 | 0.738756 | -0.167816 |
+| NID | 0.933731 | 0.859061 | -0.074670 |
+| TEDS | 0.927643 | 0.475822 | -0.451821 |
+| MHS | 0.820776 | 0.469231 | -0.351545 |
+
+## Coverage
+
+| Field | Value |
+| --- | ---: |
+| Compared documents | 200 |
+| Reference-only documents | 0 |
+| Candidate-only documents | 0 |
+
+Both reports cover the same 200 OpenDataLoader Bench documents. The deltas are
+therefore quality gaps, not corpus mismatch artifacts.
+
+## Bottom Regression Cases
+
+| Document | Overall Delta | NID Delta | TEDS Delta | MHS Delta |
+| --- | ---: | ---: | ---: | ---: |
+| `01030000000165` | -0.844331 | -0.860421 | -1.000000 | -0.672572 |
+| `01030000000170` | -0.649175 | -0.300824 | -0.997527 | n/a |
+| `01030000000082` | -0.640821 | -0.294452 | -0.987190 | n/a |
+| `01030000000110` | -0.619685 | -0.309729 | -0.929641 | n/a |
+| `01030000000104` | -0.595566 | -0.237021 | n/a | -0.954112 |
+| `01030000000168` | -0.579702 | -0.224557 | n/a | -0.934846 |
+| `01030000000185` | -0.570287 | -0.429586 | n/a | -0.710990 |
+| `01030000000084` | -0.559201 | -0.201048 | -0.917355 | n/a |
+| `01030000000147` | -0.548387 | -0.099680 | -1.000000 | -0.545483 |
+| `01030000000163` | -0.544425 | -0.454925 | n/a | -0.633924 |
diff --git a/docs/pdf-parser-runtime-prd.md b/docs/pdf-parser-runtime-prd.md
new file mode 100644
index 00000000..df0381f2
--- /dev/null
+++ b/docs/pdf-parser-runtime-prd.md
@@ -0,0 +1,2899 @@
+# DocTruth PDF Parser Runtime PRD
+
+Status: draft
+Owner: doctruthhq maintainers
+Scope: DocTruth parser/runtime layer
+Last updated: 2026-06-13
+
+## 0. Non-Negotiable Runtime Direction
+
+DocTruth parser ownership is quality-core first, Rust-shell first.
+
+```text
+Java/OpenDataLoader-compatible parser core:
+ owns the current PDF parsing quality path, PDFBox compatibility, text
+ extraction, reading order, layout/table heuristics, heading reconstruction,
+ evidence spans, parser warnings, source refs, and TrustDocument emission.
+
+Rust runtime shell:
+ owns warm process orchestration, benchmark execution, OpenDataLoader Bench
+ prediction packaging, resource accounting, model manifest/cache validation,
+ MNN worker protocol, and Python/Torch/Docling replacement.
+
+Python/OpenDataLoader original runners:
+ are oracle-only and may not become production fallback.
+```
+
+Java/OpenDataLoader-compatible parser core is the current quality source of truth.
+Rust owns the runtime shell and Python replacement boundary.
+Python/OpenDataLoader original runners are oracle-only.
+
+Rust parser-core replacement is a future ADR after benchmark parity, not the
+current default parser-quality architecture. A missing Rust runtime is an
+installation/configuration error for benchmark shell/model-worker execution,
+but it is not a reason to claim the Java parser-quality core is legacy-only.
+
+Current production model-worker direction:
+
+```text
+Production package:
+ doctruth-runtime
+ doctruth-mnn-model-worker
+
+Production runtime:
+ Java-owned parser-quality core
+ Rust-owned process/runtime orchestration
+ Rust-owned model manifest/cache validation
+ Rust-owned worker protocol and TrustDocument normalization
+
+Legacy/source-only oracle:
+ scripts/doctruth-onnx-model-worker
+ scripts/doctruth-slanext-table-worker
+ scripts/doctruth-rapidocr-mnn-worker
+```
+
+RapidOCR, SLANeXT/PaddleOCR, and ONNXRuntime Python worker scripts are no longer
+source-install or release-tarball production entrypoints. They can remain in the
+source tree only for migration comparison, differential oracle tests, or
+explicit opt-in historical smokes, and they fail closed unless
+`DOCTRUTH_ALLOW_PYTHON_ORACLE=1` is set by that test/oracle harness. Real MNN
+OCR/table/layout inference inside
+`doctruth-mnn-model-worker` is still an implementation task; the current Rust
+worker locks the production protocol, packaging, discovery, and fail-closed
+runtime boundary. Its doctor reports `protocolReady=true` and
+`inferenceReady=false` until real inference is wired. The only non-real parse
+path is the explicit `DOCTRUTH_MNN_WORKER_STUB=1` contract-smoke mode, whose
+output must be `NOT_AUDIT_GRADE`. Native MNN binding work is behind the optional
+`mnn-native` Cargo feature using `mnn-rs`; `doctruth-mnn-model-worker
+--probe-model /path/to/model.mnn` and
+`scripts/smoke-doctruth-mnn-native-probe.sh` are the acceptance entrypoints for
+real native MNN session/inference checks with executable model artifacts. The
+default build remains Rust runtime/protocol only until real model decoders are
+wired and benchmarked. Benchmark-only MNN artifacts with stripped weights do not
+count as native inference acceptance models.
+
+## 1. Summary
+
+DocTruth cannot be credible if its source evidence is wrong. The product promise
+is not merely "extract text from PDFs"; it is:
+
+```text
+Every extracted field can be traced to the correct source page, text span,
+layout region, table cell, and bounding box.
+```
+
+That means PDF parsing quality is a first-order product requirement. A wrong
+reading order, wrong table cell, wrong section boundary, or wrong bounding box
+breaks the evidence chain and makes downstream LLM extraction unverifiable.
+
+This PRD defines the next parser runtime direction: a high-accuracy,
+model-assisted, evidence-native PDF engine inspired by the runtime shape of
+projects such as Kreuzberg, Docling, MinerU, and OpenDataLoader PDF, while
+keeping DocTruth's own implementation, compatible licensing, and
+evidence/audit semantics.
+
+## 2. Problem
+
+Earlier DocTruth PDF parsing used a Java/PDFBox baseline. That baseline exposed
+real-world failure modes that directly damage evidence quality:
+
+```text
+multi-column reading order
+left/right resume layouts
+sidebar sections swallowing main-column text
+tables with missing or wrong cell boundaries
+borderless tables
+merged cells
+scanned PDFs requiring OCR
+headers/footers polluting source spans
+wrong bbox unions after section coalescing
+```
+
+The conclusion is not to discard Java/PDFBox before parity is proven. The
+correct near-term direction is to harden the Java/OpenDataLoader-compatible
+parser core, measure it against OpenDataLoader Bench, and let Rust replace the
+Python/Torch/Docling outer runtime. If the failure modes listed above survive
+into `TrustDocument`, `EvidenceSpan`, or `Citation`, then the audit trail
+becomes formally present but substantively wrong.
+
+## 3. Product Thesis
+
+DocTruth should become an evidence-first document runtime:
+
+```text
+Java/PDFBox/OpenDataLoader-compatible parser-quality core
++ Kreuzberg-style Rust runtime shell and local model operations
++ Docling/MinerU-style layered document contracts
++ OpenDataLoader-style geometric reading order and safety filters
++ DocTruth-level citation, provenance, confidence, audit, and replay semantics
+```
+
+DocTruth should not compete on "number of supported file formats" first. It
+should compete on correctness of source grounding:
+
+```text
+field -> quote -> page -> line -> bbox -> table cell -> parser/model metadata
+```
+
+## 4. Benchmark Reference
+
+DocTruth should not merge reference projects as equal parser cores. It should
+use them as layered references:
+
+```text
+Rust PDF substrate:
+ pdf_oxide as a future Rust parser-module candidate for PDF bytes, object
+ parsing, text extraction, structure-tree-aware reading order, XY-Cut
+ column-aware reading order, page geometry, rendering, content-stream safety
+ checks, line-table heuristics, and bbox evidence.
+
+Geometry and reading order:
+ Java/PDFBox/OpenDataLoader-compatible processors first. OpenDataLoader-style
+ XY-Cut++ scenarios and filters are used as behavioral references where they
+ improve resume/sidebar/header/footer/table cases.
+
+Runtime and model operations:
+ Kreuzberg-style Rust runtime shell, language wrappers, local model cache,
+ model manifest, feature-gated heavy capabilities, and sidecar/worker handoff.
+
+Document representation:
+ Docling/MinerU-style lossless document model, readable block stream,
+ intermediate page/block/line/span trace, and lossy Markdown/HTML exports.
+
+Evidence and trust:
+ DocTruth-owned TrustDocument, TrustUnit, source refs, warnings, audit gates,
+ source maps, benchmark reports, and replay-ready artifacts.
+```
+
+This is how the system gets additive benefits instead of conflicting
+heuristics: each reference project informs one layer, and `TrustDocument`
+remains the single canonical contract that all parser observations must flow
+through.
+
+### Java Quality Core / Rust Shell Boundary
+
+The current work is not a direct PDFBox replacement. The immediate boundary is:
+
+```text
+Java quality core:
+ PDF bytes -> text/page geometry/rendering/table heuristics -> Java objects
+
+DocTruth parity target:
+ Java/PDFBox + OpenDataLoader-compatible processors
+ -> TrustDocument
+ -> Rust runtime shell for corpus/model/process packaging
+ -> OpenDataLoader Bench reports
+```
+
+Current implementation status:
+
+```text
+Java parser core owns:
+ current PDFBox text-layer extraction
+ page geometry and bbox evidence
+ layout and semantic section coalescing
+ table heuristics and borderless table recovery
+ TrustDocument emission
+
+Rust runtime shell owns:
+ OpenDataLoader Bench prediction packaging
+ corpus runner and resource accounting
+ MNN model-worker protocol
+ Python/Torch/Docling replacement boundary
+ future parser modules after parity is proven
+```
+
+This means Rust can remove the expensive Python/Docling/Torch outer runtime
+without prematurely discarding the Java/PDFBox parser quality path.
+
+### Reference Composition Guardrails
+
+The reference projects do not compete if each one stays in its lane:
+
+| Layer | Primary reference | DocTruth decision |
+| --- | --- | --- |
+| PDF substrate | Java/PDFBox + OpenDataLoader-compatible processors | Current parser-quality backend for bytes, text, page geometry, reading order, table heuristics, and source refs |
+| Runtime packaging | Kreuzberg | Rust shell first; Java owns current parser quality; Python/Docling/Torch are oracle-only |
+| Model operations | Kreuzberg | Local manifest/cache/doctor/worker handoff; heavy models opt-in |
+| Reading-order edge cases | OpenDataLoader PDF | Port/verify OpenDataLoader-style XY-Cut++ cases where they improve two-column/sidebar/cross-layout behavior |
+| Parser safety filters | OpenDataLoader PDF | Hidden/off-page/tiny/duplicate/background text filters must become Java parser-core warnings and audit gates first |
+| Unified document contract | Docling | Lossless canonical model, lossy exports, provenance-rich chunks |
+| Layered output products | MinerU | Markdown, flat content blocks, middle/trace structure, debug artifacts |
+| Evidence/trust | DocTruth | Source refs, quote hashes, bbox/table-cell citations, audit gates, benchmark reports, replay packages |
+
+Conflict rule:
+
+```text
+No external parser output is canonical.
+No external schema is canonical.
+No external project schema is canonical.
+TrustDocument is canonical.
+```
+
+Current guardrail status: `ArchitectureContractTest` asserts this composition
+table and conflict rule so future docs changes cannot quietly promote
+Kreuzberg, Docling, MinerU, OpenDataLoader, or PDFBox into the canonical
+DocTruth contract.
+
+If Java/PDFBox, an OpenDataLoader-style rule, a model worker, and a tagged-PDF
+structure tree disagree, DocTruth should not silently pick a winner in strict
+mode. It should record parser provenance, emit a warning when the disagreement
+is material, and block audit-grade output for severe cases such as uncertain
+reading order, missing visual bbox, low-confidence table structure, or failed
+quote anchoring.
+
+Kreuzberg is a useful engineering benchmark because it combines Rust core,
+language bindings, CLI/API/MCP deployment, ONNX-based layout detection, table
+structure recognition, model caching, and feature-gated heavy capabilities.
+
+Important Kreuzberg reference points:
+
+- Layout detection uses RT-DETR v2 over rendered page images and detects 17
+ document layout classes such as text, table, title, form, list item,
+ key-value region, headers, footers, captions, and figures.
+- The parser core direction is Rust/native. Current Kreuzberg-style Rust PDF
+ backend learning should track `pdf_oxide` for text/page extraction and
+ rendering-oriented Rust workflows. Other language packages should be
+ bindings, wrappers, or launchers around that core, not parallel parser
+ implementations.
+- Table structure recognition is configurable after table-region detection.
+ Kreuzberg documents the model choices listed in the table below this list.
+- Token-efficient wire formats are useful for LLM/RAG pipelines when full JSON
+ is too verbose.
+- GFM-quality Markdown/HTML rendering matters because downstream agents depend
+ on fenced code blocks, table nodes, escaping, and cross-format parity.
+- HTML-to-Markdown should avoid lossy intermediate round-trips when the source
+ is already HTML.
+- Streaming parsers are important for large documents and batch workloads.
+
+| Model | Role | Approx size | Intended use |
+| --- | --- | ---: | --- |
+| RT-DETR v2 | page layout detection | 169 MB | complex layouts, multi-column PDFs, forms, scanned PDFs |
+| TATR | table structure recognition | ~29-30 MB | default, fast, general-purpose tables |
+| SLANet-plus | table structure recognition | 7.78 MB | smallest local/edge model |
+| SLANeXT Wired | table structure recognition | ~365 MB | bordered/gridlined tables |
+| SLANeXT Wireless | table structure recognition | ~365 MB | borderless tables |
+| SLANeXT Auto | table structure recognition | ~737 MB | highest-accuracy mixed-table routing |
+
+Licensing constraint:
+
+Kreuzberg code is licensed under Elastic License 2.0. DocTruth must treat it as
+a product/architecture benchmark only. Do not copy implementation code into
+DocTruth. Model artifacts must be evaluated independently by their own licenses
+and provenance.
+
+References:
+
+- Kreuzberg Layout Detection Guide: https://docs.kreuzberg.dev/guides/layout-detection/
+- Kreuzberg Features: https://docs.kreuzberg.dev/features/
+- Kreuzberg layout models: https://huggingface.co/Kreuzberg/layout-models
+- Kreuzberg license: https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE
+
+Docling is a useful product and contract benchmark because it centers the
+pipeline around a unified document representation, parser backends, pipelines,
+lossless JSON serialization, lossy Markdown/HTML exports, provenance items, and
+chunking metadata for downstream AI use.
+
+Important Docling reference points:
+
+- A single document model represents text, tables, pictures, captions, lists,
+ hierarchy, headers/footers, layout bounding boxes, and provenance.
+- JSON is the lossless representation. Markdown and HTML are useful consumption
+ exports but cannot carry every metadata field.
+- Parser backends and pipelines construct and enrich the document model.
+- Chunks should carry enough metadata to preserve section context and source
+ provenance for RAG/agent use.
+
+References:
+
+- Docling document model and architecture: https://arxiv.org/html/2501.17887v1
+- Docling technical report: https://arxiv.org/html/2408.09869v3
+- Docling supported formats: https://docling-project.github.io/docling/usage/supported_formats/
+- Docling document converter: https://docling-project.github.io/docling/reference/document_converter/
+- Docling document reference: https://docling-project.github.io/docling/reference/docling_document/
+- Docling chunking concepts: https://docling-project.github.io/docling/concepts/chunking/
+
+OpenDataLoader PDF is a useful parser-algorithm benchmark because its current
+core is Apache-2.0, its output contract centers bounding boxes and reading
+order, and its deterministic parser includes XY-Cut++ reading-order logic,
+tagged-PDF structure-tree support, header/footer filtering, hidden/off-page
+content filtering, and table border/cluster processing.
+
+Important OpenDataLoader reference points:
+
+- `XYCutPlusPlusSorter` is Apache-2.0 and can be ported into DocTruth's Rust
+ runtime as a DocTruth-owned `reading_order::xy_cut_plus_plus` module.
+- Its XY-Cut++ behavior covers cross-layout elements, adaptive horizontal vs
+ vertical cuts, narrow-outlier filtering, two-column layouts, sidebars, and
+ row/column ordering.
+- Its content filtering removes hidden text, out-of-page content, duplicated
+ chunks, background artifacts, tiny text, invalid characters, and whitespace
+ noise before semantic grouping.
+- Its tagged-PDF path uses the PDF structure tree when available, instead of
+ always guessing reading order from geometry.
+- Its table flow combines bordered-table processing, cluster-based table
+ detection, cell normalization, nested table limits, and adjacent table
+ continuation checks.
+- Its batch guidance is operationally important: avoid repeatedly starting a
+ heavy parser process for every page or file when a persistent runtime or
+ batch call can amortize startup.
+
+Licensing constraint:
+
+OpenDataLoader PDF v2+ is Apache-2.0. If DocTruth ports implementation ideas or
+tests from OpenDataLoader, preserve the Apache header/attribution, add a NOTICE
+entry for Hancom/OpenDataLoader PDF, and record the source commit. Do not copy
+from pre-2.0 MPL-licensed revisions.
+
+References:
+
+- OpenDataLoader PDF: https://github.com/opendataloader-project/opendataloader-pdf
+- OpenDataLoader PDF license: https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE
+- OpenDataLoader PDF NOTICE: https://github.com/opendataloader-project/opendataloader-pdf/blob/main/NOTICE
+- XY-Cut++ sorter: https://github.com/opendataloader-project/opendataloader-pdf/blob/main/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorter.java
+- XY-Cut++ tests: https://github.com/opendataloader-project/opendataloader-pdf/blob/main/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/readingorder/XYCutPlusPlusSorterTest.java
+
+OpenDataLoader Bench is vendored in
+`third_party/opendataloader-bench/` and should become DocTruth's
+parser-quality foundation, not a replacement for DocTruth's evidence benchmark.
+Its public benchmark focuses on the substrate quality that evidence depends on:
+
+```text
+reading order
+table fidelity
+heading hierarchy
+parse speed
+```
+
+The integration target is an adapter, not a fork:
+
+```text
+DocTruth Rust runtime
+ -> OpenDataLoader Bench prediction format
+ -> OpenDataLoader metrics and evaluation.json
+ -> DocTruth benchmark report external_metrics
+ -> DocTruth evidence/replay/audit metrics
+```
+
+Use OpenDataLoader Bench metrics as the lower parser-quality gate:
+
+```text
+NID reading-order/edit-distance quality
+TEDS table-structure similarity
+MHS heading hierarchy similarity
+speed parser throughput/latency
+```
+
+Then keep DocTruth-only evidence gates above it:
+
+```text
+bbox_coverage
+bbox_iou
+quote_anchor_accuracy
+evidence_span_accuracy
+source_map_validity
+audit_grade_pass_rate
+replay_integrity
+```
+
+Policy: a parser-quality failure must block audit-grade evidence. If reading
+order, table fidelity, heading hierarchy, or speed/resource gates are below the
+declared threshold for a corpus profile, downstream evidence spans may still be
+emitted for review, but they must not be promoted as audit-grade by default.
+
+Licensing and execution posture:
+
+- OpenDataLoader Bench is Apache-2.0 and is vendored with its license,
+ third-party notices, PDFs, ground-truth Markdown, prediction artifacts,
+ evaluator code, and charts.
+- Do not vendor or execute AGPL/GPL/commercial engines from the benchmark suite
+ in DocTruth CI. Keep such engines as external published prediction artifacts
+ only when useful for comparison.
+- The DocTruth runner should execute DocTruth's Rust runtime and permissive
+ reference engines only.
+
+References:
+
+- OpenDataLoader Bench: https://github.com/opendataloader-project/opendataloader-bench
+- OpenDataLoader Bench license: https://github.com/opendataloader-project/opendataloader-bench/blob/main/LICENSE
+
+### Benchmark Learning Status
+
+This table is the source of truth for what has been learned, implemented, and
+verified from the reference projects. "Complete" means the behavior is covered
+by DocTruth-owned tests or smoke scripts. "Partial" means the contract or
+adapter is implemented, but the broad accuracy or benchmark-parity requirement
+is still open.
+
+| Source | Learned capability | DocTruth status | Evidence | Remaining gap |
+| --- | --- | --- | --- | --- |
+| Kreuzberg | Rust runtime shell as the product runtime | Complete for shell/worker packaging, partial for broad parser-quality depth | `runtime/doctruth-runtime` has benchmark/corpus commands, model-worker handoff, packaged sidecar, Java CLI/MCP/SDK wiring, OCR/model routing contracts, resource reports, and OpenDataLoader Bench prediction packaging | Future parser-quality phases must harden the Java/OpenDataLoader-compatible core first, then prove any Rust parser replacement with benchmark evidence |
+| Kreuzberg | `pdf_oxide`-style Rust PDF backend | Experimental/secondary for parser-quality parity | Current Rust runtime has `pdf_oxide` text-layer extraction, span bbox evidence, column-order post-processing, page MediaBox geometry, rendered PNG page hashes, raw content-stream safety checks, and line-table extraction | Keep as a future parser module candidate; do not make it the current OpenDataLoader parity source of truth |
+| Kreuzberg | Local model cache and manifest-driven model handoff | Complete for cache/manifest/handoff and Rust-owned production worker protocol, partial for real MNN inference | Cache warm, SHA verification, model descriptors, runtime hints, Java and Rust doctor output, Java and Rust worker request metadata, `doctruth-mnn-model-worker` discovery, and Rust MNN worker protocol smoke | Real MNN OCR/table/layout inference, resource-profile reports, and broad accuracy/release artifact evidence are still pending |
+| Kreuzberg | RT-DETR-style layout detection | Complete for adapter/smoke and Rust runtime real-artifact entrypoint, partial for accuracy | Synthetic ONNX RT-DETR decoder smokes, opt-in public `Kreuzberg/layout-models` RT-DETR smoke, and `DOCTRUTH_RUNTIME_REAL_MODEL_ARTIFACTS=1` Rust runtime smoke | Broad labeled multi-layout corpus and calibrated layout-quality targets |
+| Kreuzberg | TATR-style table structure recognition | Complete for adapter/smoke and Rust runtime real-artifact entrypoint, partial for accuracy | Synthetic TATR decoder smokes, opt-in public Xenova TATR ONNX smoke with rendered-page input and row/column post-processing, and `DOCTRUTH_RUNTIME_REAL_MODEL_ARTIFACTS=1` Rust runtime smoke | Calibrated table normalization and labeled real-world table corpus |
+| Kreuzberg | SLANeXT/PaddleOCR-style server table recognition | Legacy oracle only after Rust MNN worker defaultization | Source-tree Python SLANeXT scripts remain available for historical comparison and opt-in smokes | Production table recognition should move into `doctruth-mnn-model-worker`; broad borderless/mixed-table corpus and calibrated table accuracy remain open |
+| Kreuzberg | Feature-gated heavy capabilities | Complete | Real model and OCR smokes are opt-in, skip safely by default, and release workflow has explicit real-model gate wiring | Remote release run evidence still needed before claiming release artifact quality |
+| Kreuzberg | Token-efficient wire format and GFM-quality output | Complete for local contracts | Compact LLM output, JSONL/Markdown renderers, source maps, streaming writer paths, GFM table rendering, HTML review anchors | Exact Kreuzberg TOON format is not copied or claimed |
+| Kreuzberg | Streaming and large-document posture | Partial | Writer-based render paths and Rust sidecar protocol exist | True streaming parse for multi-GB documents is not complete |
+| Docling | Unified document model | Complete for v1 contract | `TrustDocument`, `TrustUnit`, `TrustPage`, `TrustTable`, provenance, warnings, parser/model metadata | Contract can still grow for images/captions/forms as model coverage expands |
+| Docling | Lossless JSON with lossy Markdown/HTML exports | Complete | Deterministic JSON/audit/Markdown/HTML/plain/compact render contracts and source-map sidecars | Export parity should be rechecked when new unit kinds are added |
+| Docling | Provenance-first chunks for AI/RAG | Complete for v1 | Chunk/source-map/evidence contracts, compact LLM output, MCP evidence tools, citation verification | Broader chunking strategy can improve after real corpus feedback |
+| Docling | Parser backend/pipeline separation | Complete for shell/API separation, partial for later legacy-API migration | Parser presets, Java parser-quality backend, sidecar backend, local worker protocols, Rust runtime commands, SDK backend modes, MCP integration, and CLI output profiles | Keep Java/PDFBox/OpenDataLoader-compatible parsing as current quality core while Rust owns process/model/benchmark shell |
+| OpenDataLoader PDF | XY-Cut++ reading order | Partial for current Java quality core and Rust experimental path | Rust runtime has an attributed OpenDataLoader-style XY-Cut++ sorter, but current parity execution should copy/adapt OpenDataLoader behavior into Java parser-core tests first | Broaden against labeled real-world PDF corpus and benchmark before claiming parity |
+| OpenDataLoader PDF | Tagged-PDF structure tree preference | Complete for Rust MVP, partial for broad semantic tag export | Rust runtime uses `pdf_oxide` canonical page reading order so trustworthy Tagged-PDF structure trees beat geometric ordering, emits `parserRun.readingOrder` and `parseTrace.readingOrder`, and falls back to XY-Cut with a structured warning when `/MarkInfo /Suspects true` marks the tree unreliable | Broaden against real tagged PDFs and expose richer role/heading/list/table semantics through `TrustDocument` without making external parser schemas canonical |
+| OpenDataLoader PDF | Parser safety/content filters | Complete for Rust MVP, partial for broad visual validation | Reference content filters remove hidden/off-page/tiny/duplicate/background text and whitespace artifacts before grouping; Rust runtime now filters duplicate, whitespace-only, off-page, tiny, near-white/background-like, and invisible render-mode text-layer spans, emits severe parser-safety warnings, and blocks audit-grade output | Add robust rendered-page background comparison and broaden warning taxonomy against labeled real-world fixtures |
+| OpenDataLoader PDF | Table border/cluster heuristics | Complete for Rust MVP, partial for broad table accuracy | Rust runtime normalizes `pdf_oxide` text-spatial borderless table detection plus `pdf_oxide` content-stream line-table extraction into `TrustDocument` tables; covered behavior includes bordered grids, merged cells, row spans, and adjacent-page continuations | Broaden table metrics against labeled real-world fixtures and calibrate model-assisted table recognition |
+| OpenDataLoader Bench | Parser-quality foundation | Vendored, runner wired, first full baseline recorded locally | `third_party/opendataloader-bench/` supplies public parser-quality concepts for reading order, table fidelity, heading hierarchy, speed, ground-truth/prediction/evaluation artifacts, and NID/TEDS/MHS-style metrics. `scripts/run-doctruth-opendataloader-bench.sh` exports DocTruth Rust runtime predictions into OpenDataLoader Bench shape and runs the Rust evaluator by default; the official evaluator is explicit oracle-only. | Improve DocTruth Markdown/table/heading export and parser robustness until the real OpenDataLoader Bench baseline is competitive enough to act as an audit-grade parser-quality gate |
+| RapidOCR/MNN | Local OCR worker behind strict protocol | Complete for Rust-owned production protocol and packaging, partial for real MNN/labeled quality | `doctruth-mnn-model-worker` doctor, default discovery, source install/release packaging, Rust runtime model-worker smoke, and Python-free production worker metadata | Real MNN OCR inference and labeled real-world scanned-PDF OCR corpus |
+| DocTruth-specific | Evidence-grade audit and replay boundary | Complete for v1 contracts | Severe warning taxonomy, audit-grade blocking, source hash, bbox/table-cell evidence, review package, MCP document evidence tools | Parser accuracy still depends on broad labeled corpus and Rust-core migration |
+
+## 5. Goals
+
+### G1. Evidence-Grade PDF Structure
+
+DocTruth must emit a layout-aware `TrustDocument` with source objects that are
+stable enough for field-level citation:
+
+```text
+Page
+LayoutRegion
+TextBlock
+LineSpan
+TableRegion
+TableCell
+EvidenceSpan
+TrustUnit
+```
+
+Every object that can support an extracted field must carry:
+
+```text
+page number
+normalized bbox
+raw text
+reading-order index
+parser backend
+model backend when used
+confidence
+source hash or page image hash
+```
+
+Current Java/PDFBox baseline status: `PdfBoxParserBackend` now renders each page
+at 72 DPI with PDFRenderer and records the rendered page pixel dimensions plus a
+SHA-256 hash of the rendered PNG bytes in `TrustPage.imageHash`. The SDK
+`PdfPageImageRenderer.writePngs(...)` and CLI
+`doctruth render-pages <pdf> -o <output-dir>` can also persist deterministic
+`page-%04d.png` review artifacts plus a `page-images.json` manifest. The CLI
+`doctruth review-package <pdf> -o <output-dir>` writes a local static review
+package with `review.html`, `trust-document.json`, page image artifacts,
+`content_blocks.json`, `parse_trace.json`, `layout-debug.html`, and
+`span-debug.html`. The Phase 250 debug HTML artifacts carry trace-id data
+attributes that match `parse_trace.json`, so layout and span visual review uses
+the same trace ids as the machine-readable page/block/line/span trace.
+Rust `doctruth-runtime` now reads PDF MediaBox dimensions and default rendered
+PNG page bytes through `pdf_oxide`, then hashes those bytes for
+`TrustPage.imageHash`. `DOCTRUTH_RUNTIME_PAGE_RENDERER` remains an explicit
+override for compatibility checks; otherwise render failures fall back to a
+stable content/dimension hash. Runtime tests compare `imageHash` against
+`pdf_oxide` rendered PNG bytes.
+
+### G2. Model-Assisted Layout And Tables
+
+DocTruth should keep a fast heuristic/text-layer baseline, but add optional
+model-assisted paths for hard PDFs:
+
+```text
+layout detection
+table detection
+table structure recognition
+OCR routing
+region-aware reading order
+cell-level evidence spans
+```
+
+Current status: model-assisted presets now have an explicit local model-worker
+protocol instead of only falling through to heuristic PDFBox parsing. When
+`doctruth.model.command`, `DOCTRUTH_MODEL_COMMAND`, or `LOCAL_MODEL_COMMAND` is
+configured, `TrustDocumentParser` sends the preset, source hash, source bytes,
+and required model descriptors to the worker over JSON stdin/stdout. A
+`TABLE_LITE` contract test and CLI smoke prove a configured worker can return a
+full `TrustDocument` with model-produced `TrustTable` and `TABLE_CELL` units,
+`parserRun.backend=rust-sidecar+model-worker`, optional worker-level
+provenance such as `workerBackend`, and no `model_unavailable_fallback`.
+This is a runtime boundary and replay contract; it does not yet demonstrate
+production RT-DETR/TATR/SLANeXT accuracy. DocTruth now also ships `scripts/doctruth-onnx-model-worker`,
+a local JSON model-worker adapter that imports ONNXRuntime, loads a
+SHA-verified cached ONNX artifact, executes one session run, and returns a
+`TrustDocument` through the same Java model-worker path. The ONNX smoke creates
+a tiny identity model locally and proves real ONNXRuntime loading/execution,
+cache warm, doctor, and parse integration. A second ONNX smoke now creates a
+TATR/DETR-like model with `pred_logits` and `pred_boxes`; the worker decodes
+the table/cell detections into `TrustTable` and `TABLE_CELL` units. A
+low-confidence table smoke verifies table/cell structure detections below
+`0.85` preserve the table for review/replay while emitting severe
+`table_structure_low_confidence` and downgrading the document to
+`NOT_AUDIT_GRADE`. A third ONNX smoke creates an RT-DETR/DETR-like layout
+model with the same `pred_logits`/`pred_boxes` shape and verifies
+`task=layout-detection` produces bbox-bearing layout `TEXT_BLOCK` units in
+reading order. A low-confidence layout smoke now verifies detections below
+`0.85` preserve the region for review/replay while emitting severe
+`layout_low_confidence` and downgrading the document to `NOT_AUDIT_GRADE`.
+These prove local decoder contracts over
+synthetic ONNX outputs, but not curated CI-owned model artifact coverage or
+real-world parser accuracy. `scripts/smoke-doctruth-real-rtdetr-artifact.sh`
+is an opt-in bridge for one public document-layout RT-DETR artifact: with
+`DOCTRUTH_REAL_RTDETR_SMOKE=1`, it downloads or reuses
+`Kreuzberg/layout-models` `rtdetr/model.onnx`, writes a SHA-pinned manifest,
+warms the local model cache, and runs the Java CLI model-worker harness with
+`task=layout-detection`. The worker supports RT-DETR's `images` and
+`orig_target_sizes` inputs, ImageNet-normalizes the rendered page image for the
+`images` input, and decodes `labels`/`boxes`/`scores` into bbox-bearing layout
+`TEXT_BLOCK` units using the documented 17 document layout classes.
+`scripts/smoke-doctruth-real-tatr-artifact.sh` is an opt-in bridge for one
+public TATR artifact: with
+`DOCTRUTH_REAL_TATR_SMOKE=1`, it downloads or reuses
+`Xenova/table-transformer-structure-recognition` `onnx/model_quantized.onnx`,
+writes a SHA-pinned manifest, warms the local model cache, and runs the existing
+real model artifact harness through Java CLI plus `doctruth-onnx-model-worker`.
+The worker now renders the first PDF page with local `pdftoppm` when available,
+preprocesses it through Pillow into a 4D `[1, 3, H, W]` tensor, and reports
+`metrics.inputSource=rendered_page`; non-vision and unavailable-renderer paths
+fall back to `synthetic_tensor`. For the public TATR artifact, the worker now
+uses the real Table Transformer label set (`table`, `table row`,
+`table column`, `table column header`, projected row headers, and spanning
+cells) and builds provisional cell evidence from row/column intersections
+instead of treating every non-table detection as a flat cell. The opt-in smoke
+asserts multi-row and multi-column cell output on a generated grid PDF. This
+proves real ONNXRuntime execution plus first-pass TATR post-processing. It does
+not yet prove production table accuracy, because TATR-specific normalization
+calibration, SLANeXT parity, and labeled table accuracy are still separate work. The packaged
+ONNX adapter is split into
+an executable `doctruth-onnx-model-worker` shim and same-directory
+`doctruth_onnx_worker_lib.py` support module so decoder growth stays within the
+package boundary. `scripts/smoke-doctruth-runtime-real-model-artifacts.sh` is
+the Rust-runtime real artifact entrypoint: with
+`DOCTRUTH_RUNTIME_REAL_MODEL_ARTIFACTS=1`, it downloads or reuses the public
+RT-DETR and TATR ONNX artifacts, prepares SHA-pinned model manifests and the
+Rust runtime model cache, invokes `doctruth-runtime` `parse_pdf` with
+`DOCTRUTH_RUNTIME_MODEL_COMMAND=scripts/doctruth-onnx-model-worker`, and
+asserts `parserRun.backend=rust-sidecar+model-worker` while preserving the
+worker's original backend as `parserRun.workerBackend`. This proves public
+RT-DETR/TATR artifact execution can be controlled from the Rust runtime path.
+SLANeXT and OCR now have matching generated real-route Rust runtime smokes.
+This still does not prove broad production parser accuracy. The ONNX worker split keeps
+project LOC limits intact while source installs, release tarballs, Homebrew
+formulae, and release smoke tests still exercise the real packaged command.
+DocTruth now
+also ships `scripts/doctruth-slanext-table-worker`, a PaddleOCR/SLANeXT JSON
+model-worker adapter for `table-server` style table extraction. The packaged
+fake-runtime smoke proves worker doctor readiness, direct JSON worker output,
+Java CLI model-worker integration, and table-cell preservation without bundling
+PaddleOCR or SLANeXT model binaries. `scripts/smoke-doctruth-real-slanext-artifact.sh`
+is the opt-in real runtime hook for environments that have PaddleOCR/SLANeXT
+installed. The real smoke has been verified in an isolated Python 3.10 venv
+with PaddleOCR 3.7.0 and PaddlePaddle 3.3.1; the adapter handles PaddleOCR
+3.7 `TableRecResult.json.res` output, HTML-like table structure tokens, and
+flat 8-number quadrilateral bboxes. This proves runtime integration on a
+generated grid PDF, not broad SLANeXT table accuracy.
+`scripts/smoke-doctruth-real-model-suite.sh` is the single release/CI entrypoint
+for running the public real-model smoke set together. It defaults to a safe
+skip, and with `DOCTRUTH_REAL_MODEL_SUITE=1` runs RT-DETR, TATR, and SLANeXT;
+`DOCTRUTH_SLANEXT_PYTHON` can point only the SLANeXT step at an isolated
+PaddleOCR venv without disturbing the ONNXRuntime Python used by RT-DETR/TATR.
+The suite is included in source installs and release tarballs. The release
+workflow installs `poppler-utils`, ONNXRuntime/Pillow/NumPy, and
+PaddleOCR/PaddlePaddle, then runs the suite with `DOCTRUTH_REAL_MODEL_SUITE=1` before
+publishing release artifacts. These Python dependencies should remain pinned to
+a verified compatible set, currently ONNXRuntime 1.26.0, Pillow 12.x,
+`numpy<2.4`, PaddleOCR 3.7.0, and PaddlePaddle 3.3.1. Ordinary CI runs the
+suite's safe skip path to catch packaging regressions without downloading large
+models on every PR.
+
+Ordinary CI also runs `scripts/smoke-doctruth-parser-accuracy-seed-corpus.sh`.
+That smoke creates generated multi-layout, table, and OCR fixtures, writes a
+`qualityProfile: "parser-accuracy"` manifest with `multi-layout`, `table`,
+`ocr`, `bbox`, and `source-map` coverage tags, and gates the corpus through
+`benchmark-corpus`. It proves the parser-accuracy corpus contract and metric
+plumbing; it is not a substitute for the broad real-world human-labeled corpus.
+
+### G3. Rust Core With Java Wrapper Compatibility
+
+This is a product/runtime decision, not an optional implementation idea:
+DocTruth's parser/runtime core is Rust. Java remains only the SDK, CLI, API,
+packaging, lifecycle, and compatibility wrapper that calls into Rust through a
+native binding or a sidecar process.
+
+New parser-quality behavior must land in `runtime/doctruth-runtime` first:
+text extraction, page rasterization, layout detection, table recognition, OCR,
+model-cache verification, benchmark-corpus execution, parser warnings, and
+evidence reconciliation. Java may expose, package, adapt, and compatibility-test
+those capabilities, but it must not become the primary home for new parser
+logic.
+
+```text
+DocTruth Java wrapper
+ -> JNI/native library OR sidecar process
+ -> Rust parser runtime
+ -> evidence-native TrustDocument
+```
+
+The public Java API must not force users to understand Rust.
+
+Allowed Java responsibilities:
+
+```text
+stable SDK/API facade
+CLI command surface
+Maven packaging
+backward-compatible ParsedDocument/Citation adapters
+sidecar/native process lifecycle
+error mapping
+public API compatibility tests
+release packaging checks
+```
+
+Disallowed Java-first responsibilities after a future Rust-core ADR:
+
+```text
+new OCR/table/layout model execution logic
+new parser-quality benchmark ownership
+unmeasured parser-quality changes
+unattributed copied reference behavior
+audit-grade parser decisions without benchmark evidence
+```
+
+Current Java/PDFBox/OpenDataLoader-compatible parsing is the quality core. It is
+not a fallback. Rust owns the runtime shell around it and any future Rust parser
+replacement must be benchmark-proven before becoming default.
+
+### G4. Local-First Runtime
+
+DocTruth must work locally without network calls by default. Heavy models are
+downloaded only when explicitly enabled or when a preset requires them.
+
+```text
+default install: no large model download
+first layout run: download verified model
+doctor: verify cache, SHA256, backend availability
+offline mode: use existing cache only
+```
+
+Current status: `doctruth doctor --json` reports parser availability, model
+cache state, OCR worker readiness, and now configured model-worker readiness
+under `models.worker`. The model-worker readiness check uses the same
+local-first rule as OCR: it only probes an explicitly configured executable with
+`--doctor`, reports `available` separately from `ready`, carries structured
+`statusCode`/`message`, timeout, loaded model ids, and worker-reported
+`rssMb`/`peakMemoryMb`, and does not download models or run inference.
+Model-assisted parse requests sent to a configured worker now also include
+`modelCacheDirectory` and per-model `cachePath`, `cacheStatus`, `actualSha256`,
+and `actualSizeBytes`, derived from the local cache verifier. This gives real
+ONNX/TATR/SLANeXT workers a stable handoff point without treating missing or
+SHA-mismatched artifacts as ready. The ONNX worker direct resource smoke now
+asserts real parse-time `metrics.wallMs`, `metrics.inferenceWallMs`, `rssMb`,
+and `peakMemoryMb` from an actual ONNXRuntime session. This is worker-internal
+measurement, not an OS-level profiler or real production model load benchmark.
+Local model descriptors can also be supplied through `doctruth.model.manifest`
+or `DOCTRUTH_MODEL_MANIFEST`, keyed by preset id. The packaged model-worker
+smoke now creates a SHA-matched local artifact and manifest and verifies that a
+configured worker receives `cacheStatus=READY`.
+`doctruth doctor --json` now also reads `DOCTRUTH_MODEL_MANIFEST`, verifies all
+manifest model artifacts in `DOCTRUTH_MODEL_CACHE`, and reports `allReady` plus
+per-artifact identity, cache path, status, actual SHA-256, and actual size.
+This gives developers a no-inference preflight for local model readiness before
+they run a model-assisted preset. Manifest entries now also preserve runtime
+hints (`task`, `backend`, `format`, `precision`, `license`) through
+`doctruth cache warm --json`, `doctruth doctor --json`, and the local model
+worker request. That keeps model identity/SHA verification separate from model
+execution hints while giving future real ONNX/TATR/SLANeXT workers enough
+metadata to route the correct runtime path.
+
+Current Rust runtime status: `doctruth-runtime --doctor` also reports the local
+model pipeline directly, not only through the Java CLI wrapper. It includes
+native text extraction, document-structure/reading-order slots, layout/table/OCR
+capability slots, the configured model manifest path, model cache directory,
+per-preset model identities, `READY` / `MISSING` / `SHA_MISMATCH` cache status,
+actual SHA-256 and size, worker configured/available/ready separation,
+worker-reported memory fields, and runtime RSS/peak memory. This doctor path
+does not download models or run inference, so it remains safe for local-first
+install checks and CI capability reporting.
+
+### G5. Measurable Parser Quality
+
+Parser quality must be evaluated with fixtures and metrics, not screenshots
+alone.
+
+Required metrics:
+
+```text
+external_parser_quality:
+ opendataloader_nid
+ opendataloader_teds
+ opendataloader_mhs
+ opendataloader_speed
+
+doctruth_parser_quality:
+ reading_order_f1
+ section_boundary_f1
+ table_region_iou
+ table_cell_f1
+ bbox_iou
+ quote_anchor_accuracy
+ evidence_span_accuracy
+ ocr_text_accuracy
+ parser_latency_p50/p95
+ rss_peak_mb
+ model_cache_size_mb
+
+doctruth_evidence_quality:
+ source_map_validity
+ audit_grade_pass_rate
+ replay_integrity
+```
+
+Current benchmark status: `ParserBenchmarkRunner` now reports
+`section_boundary_f1` by comparing recovered heading-like section boundary
+lines against expected Markdown boundaries, so corpus manifests can gate the
+PRD section-boundary metric directly. It reports `evidence_span_accuracy` by
+checking whether expected text lines are covered by actual units with evidence
+span ids, without requiring generated internal span ids to be stable across
+label and parser outputs. It also reports `ocr_text_accuracy` for OCR-backed
+`TrustDocument` output by comparing OCR region text against the expected
+Markdown text. Non-OCR cases score this metric as `1.0` so existing
+text-layer/table corpora are not penalized. Benchmark corpus manifests can set
+per-case `preset`, including `preset: "ocr"`, and the CLI `benchmark-corpus`
+smoke includes generated section-boundary, evidence-span, and scanned-PDF OCR
+cases gated by `section_boundary_f1`, `evidence_span_accuracy`, and
+`ocr_text_accuracy`. The same smoke must also include a wrong-label OCR corpus
+that exits non-zero and names the failing case plus `ocr_text_accuracy`, so OCR
+labels cannot silently drift. Benchmark cases also carry runtime observations and report
+`rss_peak_mb` plus `model_cache_size_mb`; `fromPdf(...)` records local JVM
+memory/cache observations as a fallback, while configured workers can supply
+stronger resource measurements through the benchmark case contract.
+Benchmark corpus manifests now also distinguish generated fixtures from
+human-labeled accuracy corpora. A manifest with `"kind": "human-labeled"` must
+include `labeling.labelSetVersion`, `labeling.reviewedAt`,
+`labeling.reviewer`, and non-empty `labeling.requiredMetrics`; every required
+metric must have an explicit `minimums` or `maximums` threshold. The CLI JSON
+output includes `kind`, `labelSetVersion`, and `requiredMetrics` so CI and
+release reports cannot silently treat generated fixture gates as human-labeled
+accuracy evidence. This completes the corpus contract and smoke gate for
+human-labeled corpora. Parser-accuracy cases now also carry case-level
+`labelId` and `tags`, and `benchmark-corpus --json` emits those fields for
+each case so a passing CI report can be traced back to the reviewed label set
+and required coverage category. Parser-accuracy manifests must also declare
+`labeling.reviewType`, currently either `generated-seed` or `human-reviewed`.
+The generated seed corpus uses `generated-seed`; the future real-world accuracy
+corpus must use `human-reviewed`. The public W3C remote-PDF smoke now also
+declares `kind: "human-labeled"` and verifies this metadata through CLI JSON
+output, but it is still a small contract fixture rather than the actual large
+real-world labeled corpus. For a corpus that wants to claim parser accuracy,
+the manifest must add `qualityProfile: "parser-accuracy"` plus
+`labeling.requiredTags` and `labeling.minCasesPerTag`; the loader rejects the
+corpus when required coverage tags such as multi-layout, table, or OCR have too
+few cases. When `labeling.reviewType` is `human-reviewed`, the manifest must
+also declare `labeling.minTotalCases`, and the loader rejects reports with fewer
+total cases than that declared minimum. Human-reviewed parser-accuracy cases
+must also include `sourceSha256`; DocTruth verifies the SHA-256 for both local
+`source` files and remote `sourceUrl` cache entries before treating the label as
+valid. Generated seed corpora can remain small with
+`reviewType: "generated-seed"` and may omit source pins because they are
+plumbing gates, not accuracy evidence. Human-reviewed parser-accuracy manifests
+must declare the core parser-quality metric set in `labeling.requiredMetrics`:
+`reading_order_f1`, `quote_anchor_accuracy`, `bbox_coverage`, `bbox_iou`,
+`evidence_span_accuracy`, `table_cell_f1`, and `ocr_text_accuracy`. Each
+declared metric must still have an explicit threshold, even when a generated
+contract fixture uses a conservative threshold such as `bbox_iou: 0.0`; broad
+accuracy claims require stronger human-reviewed thresholds and recorded corpus
+runs. Human-reviewed parser-accuracy manifests must also declare the core
+coverage tags in `labeling.requiredTags`: `multi-layout`, `table`, `ocr`,
+`bbox`, and `source-map`; this prevents a broad corpus from passing while
+silently omitting a major document class or evidence surface. CLI JSON emits
+`qualityProfile`, `reviewType`, `requiredTags`, `minCasesPerTag`, and
+`minTotalCases` when present, so CI
+reports can prove corpus scale, coverage, and label-review posture instead of
+only proving that thresholds passed on a small fixture.
+
+### G6. LLM-Efficient And Streaming Runtime
+
+DocTruth must support AI consumption without making callers choose between
+verbose lossless JSON and ungrounded plain text.
+
+Required capabilities:
+
+```text
+lossless TrustDocument JSON for audit and replay
+compact evidence wire format for LLM/RAG pipelines
+GFM-quality Markdown output
+HTML review output with stable anchors
+HTML passthrough when source HTML can be converted directly
+streaming parser and renderer paths for large files
+```
+
+The compact wire format may learn from TOON-style serialization, but DocTruth
+should not commit to Kreuzberg's naming or exact format until licensing,
+interoperability, and parser-contract requirements are reviewed. The product
+requirement is token-efficient, deterministic, evidence-preserving
+serialization.
+
+## 6. Non-Goals
+
+DocTruth should not become:
+
+```text
+a general RAG framework
+a general document chatbot
+a vector database wrapper
+a hosted parser SaaS by default
+a clone of Kreuzberg
+a wholesale wrapper around Kreuzberg internals
+a confused merge of Kreuzberg, Docling, MinerU, and OpenDataLoader pipelines
+```
+
+DocTruth may support multiple formats, but PDF evidence correctness is the
+priority because PDF is where citation grounding most often fails.
+
+## 7. User Experience
+
+### SDK / Wrapper API
+
+The Java API is a wrapper around the Rust runtime. It is not the parser owner.
+Calls that parse into `TrustDocument` must route to `doctruth-runtime` by
+default.
+
+```java
+var doc = DocTruth.withProvider(provider)
+ .parsePdf("resume.pdf")
+ .withParser(ParserPreset.STANDARD)
+ .parse();
+
+var result = doc.extractJson(schema)
+ .withEvidence()
+ .runJson();
+```
+
+For parser-only SDK use, the static entrypoint must also accept an explicit
+parser preset:
+
+```java
+var doc = TrustDocumentParser.parse(path, ParserPreset.STANDARD);
+```
+
+If a model-assisted preset such as `STANDARD`, `TABLE_LITE`, `TABLE_SERVER`, or
+`OCR` is requested while required local models are unavailable, the Rust runtime
+may still emit a heuristic `TrustDocument` for inspection, but it must include a
+severe `model_unavailable_fallback` parser warning and evaluate as
+`NOT_AUDIT_GRADE`. The caller must never receive silent heuristic success for a
+requested model-assisted parse. Java must not implement an independent
+model-assisted parser path.
+
+### CLI
+
+```bash
+doctruth parse resume.pdf --preset standard --out trust-document.json
+doctruth parse resume.pdf --layout --table-model tatr --bboxes
+doctruth doctor models
+doctruth cache warm --model tatr
+```
+
+### Output Formats
+
+DocTruth parser output must serve multiple consumers. The canonical internal
+shape is `TrustDocument` JSON, but the most common downstream consumer may be an
+LLM or agent. Markdown is therefore a first-class product output, not a demo
+format.
+
+Required output modes:
+
+| Format | Primary consumer | Requirement |
+| --- | --- | --- |
+| JSON | SDKs, storage, audit pipelines | Lossless structure with pages, regions, tables, spans, parser/model metadata |
+| Markdown | LLMs, agents, human review | Reading-order text with headings, lists, tables, and stable evidence anchors |
+| Content Blocks JSON | LLM/RAG ingestion, cleanup, indexing | Flat reading-order blocks derived from the canonical parse |
+| Parse Trace JSON | parser QA, audit debugging, sourceRefs | Page -> block -> line -> span intermediate evidence layer |
+| HTML | review UI, bbox overlays | Layout-aware visual inspection with source regions and table cells |
+| JSONL | batch/indexing pipelines | One source object, block, table, cell, or evidence span per line |
+| Audit JSON | compliance/replay systems | Signed or hashable extraction evidence package |
+| Compact Wire | LLM/RAG pipelines | Token-efficient deterministic representation of evidence-bearing content |
+
+Markdown must preserve source grounding. It should not flatten the document into
+untraceable prose. Every block that can be cited should carry a stable anchor:
+
+```markdown
+## Work Experience {#ev:span_042 page=1 bbox="320,140,910,410"}
+
+Executive, Quality Assurance
+Malaysia University of Science and Technology | Jun 2025 - Present
+
+| Company | Role | Dates |
+| --- | --- | --- |
+| IMC Industries | Finance Admin | Sept 2024 - Present |
+
+```
+
+However, not every consumer wants anchors inline. DocTruth output must separate
+the canonical evidence-preserving representation from clean consumption
+renderings:
+
+```text
+canonical output
+ lossless, evidence-preserving, replayable, contains anchors and metadata
+
+clean output
+ easy to clean, easy to chunk, easy for LLMs to consume, minimal syntax noise
+```
+
+Pure Markdown mode is allowed and useful:
+
+```text
+markdown_clean
+ no inline evidence anchors
+ no HTML comments
+ no bbox metadata
+ no parser/model metadata in the body
+ stable page/section breaks only when useful
+```
+
+But clean output must be derived from the same canonical parse. The caller can
+choose to omit evidence from the rendered body, but DocTruth should still be
+able to emit a sidecar source map when requested:
+
+```text
+document.md
+document.doctruth-map.json
+```
+
+The source map links clean Markdown offsets back to evidence spans:
+
+```json
+{
+ "content_hash": "sha256:...",
+ "anchors": [
+ {
+ "markdown_start": 128,
+ "markdown_end": 244,
+ "evidence_span_id": "span_042",
+ "page": 1,
+ "bbox": [320, 140, 910, 410]
+ }
+ ]
+}
+```
+
+DocTruth should expose MinerU-style layered parser products without copying the
+MinerU schema verbatim:
+
+```text
+markdown_clean
+ final human/LLM-readable rendering
+ no evidence required in body
+
+content_blocks.json
+ flat reading-order block stream
+ best default for LLM/RAG ingestion and cleanup
+
+parse_trace.json
+ page -> block -> line -> span intermediate evidence layer
+ best default for parser QA, sourceRefs, bbox debugging, and replay inspection
+
+trust.json
+ canonical DocTruth replay/evidence contract
+ stable public object model for SDKs, MCP, MemTruth, and audit exports
+```
+
+The split matters because Markdown alone is not an evidence source. It is a
+rendering. `content_blocks.json` is allowed to be easy to clean and consume.
+`parse_trace.json` must preserve the parser's intermediate observations,
+including discarded blocks and low-confidence spans, so bugs in reading order,
+multi-column layout, sidebars, headers/footers, OCR, and table segmentation can
+be replayed and debugged without rerunning the parser.
+
+`content_blocks.json` should contain only readable content in final reading
+order. Typical block types:
+
+```text
+text
+heading
+list
+table
+image
+chart
+equation
+code
+header
+footer
+page_number
+aside_text
+```
+
+Each content block should carry:
+
+```text
+block_id
+type
+page
+bbox
+reading_order
+text or structured body
+heading_level when applicable
+source_unit_ids[]
+evidence_span_ids[]
+warnings[]
+```
+
+`parse_trace.json` should preserve the deeper intermediate structure:
+
+```text
+pages[]
+ page_index
+ page_size
+ preproc_blocks[]
+ reading_blocks[]
+ discarded_blocks[]
+ images[]
+ tables[]
+ equations[]
+
+block
+ block_id
+ type
+ bbox
+ reading_order
+ confidence
+ model_run_id
+ lines[]
+
+line
+ line_id
+ bbox
+ text
+ spans[]
+
+span
+ span_id
+ type
+ content
+ bbox
+ score
+ source_object_id
+ evidence_span_id
+```
+
+The parser should also emit visual QA artifacts equivalent in purpose to
+layout/span debug PDFs:
+
+```text
+layout debug artifact
+ visualizes layout blocks and reading order
+
+span debug artifact
+ visualizes text/OCR spans, dropped text, equations, and segmentation
+```
+
+Current Phase 250 status: `doctruth review-package` writes
+`content_blocks.json`, `parse_trace.json`, `layout-debug.html`, and
+`span-debug.html` alongside the canonical review package files. The debug HTML
+uses `data-trace-block-id`, `data-trace-line-id`, and `data-trace-span-id`
+attributes that are verified against `parse_trace.json`. This satisfies the
+review-package visual trace artifact contract, but it remains a deterministic
+projection from the current parser contract. It is not a claim that Rust-native
+real model/OCR execution, production parser-model accuracy, or the broad
+human-reviewed parser accuracy corpus is complete.
+
+Current Rust runtime real-model handoff status: the runtime has a
+safe-by-default smoke,
+`scripts/smoke-doctruth-runtime-real-model-suite.sh`, that routes
+`doctruth-runtime` `parse_pdf` through `DOCTRUTH_RUNTIME_MODEL_COMMAND`,
+verifies model-assisted parser metadata, and can be pointed at a compatible
+real worker with `DOCTRUTH_RUNTIME_REAL_MODEL_COMMAND`. This proves the Rust
+runtime is the control point for model-assisted parsing. It does not by itself
+prove production RT-DETR, TATR, SLANeXT, or OCR model accuracy; those still
+require opt-in real artifact runs and labeled corpus reports.
+
+Current Rust runtime model-worker status: `doctruth-runtime` defaults
+model-assisted `table-lite`, `table-server`, and `ocr` routes to
+`doctruth-mnn-model-worker` when a model route is selected and no explicit
+worker command is configured. The production install and release package include
+the Rust runtime and Rust MNN worker only. Legacy Python RapidOCR,
+SLANeXT/PaddleOCR, and ONNXRuntime scripts remain source-tree oracle tools for
+migration comparison and opt-in historical smokes, and their entrypoints require
+`DOCTRUTH_ALLOW_PYTHON_ORACLE=1`. The current Rust MNN worker
+locks the protocol, default discovery, fail-closed MNN-only model acceptance,
+and `TrustDocument` normalization. Without explicit stub mode it rejects
+model-assisted parse requests with `mnn_inference_unavailable` until real MNN
+inference is implemented. Stub output is severe-warning, non-audit-grade output
+for contract smokes only. The optional `mnn-native` feature verifies that
+`mnn-rs` can compile as the native binding seam without changing default runtime
+weight. `--probe-model` verifies native MNN session creation and inference when
+given a real executable `.mnn` artifact; benchmark-only or shape-only MNN files
+with stripped weights are rejected by MNN and do not satisfy acceptance. Real
+OCR/table decoder wiring and broad labeled OCR/table accuracy are still open
+implementation and evaluation work.
+
+Current parser-accuracy corpus status: JSON and readable
+`benchmark-corpus` output expose `kind`, `qualityProfile`, `reviewType`,
+`labelSetVersion`, `requiredMetrics`, `requiredTags`, `minCasesPerTag`,
+`minTotalCases`, and per-case `labelId`/`tags`. This makes generated and
+human-reviewed parser accuracy runs auditable in CI logs, but it does not
+replace the missing broad public human-reviewed PDF corpus.
+
+For LLM consumption, Markdown should support:
+
+```text
+stable heading hierarchy
+reading-order-correct sections
+GFM-compatible fenced code blocks
+GFM-compatible table nodes
+safe bracket and pipe escaping
+tables rendered as Markdown tables when structure is reliable
+HTML table fallback when rowspan/colspan cannot be represented safely
+inline evidence anchors
+page breaks
+low-confidence warnings
+source span ids
+token-budget-friendly chunking
+```
+
+Markdown, HTML, plain text, and compact wire output must be rendered from the
+same `TrustDocument` source, with cross-format parity checks for headings,
+tables, lists, code blocks, anchors, and warnings.
+
+When the source is HTML, DocTruth should preserve high-quality HTML-to-Markdown
+conversion output directly where possible instead of converting through an
+intermediate representation that destroys heading levels, tables, links,
+bracket escaping, or code blocks. The canonical `TrustDocument` still records
+the source map and parser provenance.
+
+The parser must expose output profiles:
+
+```text
+markdown_llm
+ compact Markdown for model context; anchors may be inline or sidecar
+
+markdown_review
+ human-readable Markdown with page markers and warnings
+
+markdown_clean
+ pure Markdown body optimized for LLM ingestion and downstream cleaning
+
+markdown_anchored
+ Markdown body with inline evidence anchors for citation-aware agents
+
+plain_text
+ clean text and tab-separated table rows for cleanup, keyword search, and
+ simple LLM context; not audit-grade without JSON/source-map sidecars
+
+json_full
+ lossless parser output
+
+json_evidence
+ compact evidence spans for DocTruth/MemTruth ingestion
+
+html_review
+ visual review surface with bbox anchors and page-scoped overlay layers
+
+compact_llm
+ token-efficient evidence-preserving wire format for LLM/RAG pipelines;
+ preserves optional bbox metadata for citeable units and supports writer-based
+ output for file/export paths
+```
+
+Current SDK streaming writer status:
+
+```text
+TrustDocument.writeMarkdownClean(writer)
+TrustDocument.writeMarkdownAnchored(writer)
+TrustDocument.writeMarkdownReview(writer)
+TrustDocument.writePlainText(writer)
+TrustDocument.writeJsonLines(writer)
+TrustDocument.writeCompactLlm(writer)
+TrustDocument.writeJsonFull(writer)
+TrustDocument.writeJsonEvidence(writer)
+TrustDocument.writeAuditJson(writer)
+TrustDocument.writeHtmlReview(writer)
+TrustDocument.writeMarkdownSourceMap(writer)
+TrustDocument.writeCompactLlmSourceMap(writer)
+```
+
+These writer APIs must produce output byte-identical to their string-returning
+counterparts while writing incrementally into caller-owned writers instead of issuing one full-payload write. Parser
+ingestion still materializes a `TrustDocument`. CLI `--out` file export now
+routes all current TrustDocument output formats through writer paths, and
+TrustDocument stdout output uses the same writer dispatch. Source-map sidecar
+file serialization also uses a writer path, and SDK/CLI source-map sidecar
+writers can write directly from `TrustDocument` without requiring callers to
+materialize a `TrustRenderedDocument`. The compatibility
+`toMarkdownWithSourceMap()` / `toCompactLlmWithSourceMap()` APIs still return
+`TrustRenderedDocument`, and source-map JSON still includes full rendered text
+by contract. Canonical and evidence hash inputs use writer-backed digest paths
+instead of aggregate JSON strings. Benchmark size metrics use writer-backed byte
+counters for full JSON and compact LLM output. `verify-source-map` hashes
+rendered and source files with streaming file reads. CLI parse and SDK path
+parse source hashing also use streaming file reads. SDK input-stream parsing now
+copies input incrementally into a temporary file instead of calling
+`InputStream.readAllBytes()`, then uses the same Rust-runtime path as file
+parsing so source hashes and page-image metadata remain consistent. The
+byte-array upload API still necessarily receives bytes already materialized by
+the caller. Java/PDFBox/OpenDataLoader-compatible parsing remains the current
+quality backend while Rust owns shell/runtime behavior.
+
+LLM-facing Markdown must be deterministic: the same parser version, preset,
+model versions, and source hash should produce byte-stable output unless the
+caller opts into non-deterministic post-processing.
+
+### Cleanability Requirements
+
+All rendered outputs must be easy to clean and post-process:
+
+```text
+no random IDs in visible body unless explicitly requested
+stable whitespace normalization
+stable heading levels
+stable table formatting
+no hidden proprietary markers in clean modes
+no irreversible lossy rewrite unless warning is emitted
+sidecar source maps instead of inline noise when requested
+round-trip hash linking between clean output and canonical parse
+cross-format parity for headings, tables, lists, links, code blocks, and anchors
+streaming render support for large documents
+```
+
+DocTruth should expose cleaning-safe flags:
+
+```bash
+doctruth parse resume.pdf --format markdown --profile clean
+doctruth parse resume.pdf --format markdown --profile anchored
+doctruth parse resume.pdf --format markdown --profile clean --source-map
+```
+
+Clean mode is not audit-grade by itself. It is a consumption view. Audit-grade
+status belongs to the canonical parse plus evidence map.
+
+### MCP / Skill Runtime
+
+The MCP tool should expose document evidence primitives, not just raw text:
+
+```text
+doctruth.parse_document
+doctruth.get_layout_regions
+doctruth.get_table_cells
+doctruth.get_evidence_span
+doctruth.verify_citation
+```
+
+When MemTruth uses DocTruth as a sidecar, it should receive evidence-native
+objects:
+
+```text
+SourceDocument
+EvidenceSpan
+ClaimCandidate
+TableCellEvidence
+ReplayObject
+```
+
+## 8. Runtime Presets
+
+### `lite`
+
+Default local mode. No heavy model download.
+
+```text
+PDF text layer
+heuristic line/block grouping
+basic table heuristics
+page/line/bbox when available
+```
+
+Acceptance:
+
+```text
+single-column PDFs parse correctly
+simple resumes preserve section boundaries
+no model cache required
+```
+
+### `standard`
+
+Default quality mode for serious extraction.
+
+```text
+text-layer parser
+layout detection when heuristics are uncertain
+TATR table recognition
+model cache verification
+```
+
+Acceptance:
+
+```text
+multi-column reading order improves over lite
+table region and common cell structure are preserved
+citations can point to table cells or layout regions
+```
+
+### `table-lite`
+
+Smallest table model mode.
+
+```text
+SLANet-plus or equivalent small model
+resource-constrained local environments
+fast approximate table structure
+```
+
+Acceptance:
+
+```text
+small model cache
+reasonable accuracy on simple bordered tables
+clear confidence degradation on hard tables
+```
+
+### `table-server`
+
+High-quality table mode.
+
+```text
+SLANeXT Wired/Wireless/Auto or equivalent licensed model set
+GPU/CoreML/CUDA/TensorRT when available
+cell-level table evidence
+```
+
+Acceptance:
+
+```text
+borderless and merged-cell tables improve materially over standard
+model metadata is written into audit JSON
+```
+
+### `ocr`
+
+Scanned PDF mode.
+
+```text
+page rasterization
+OCR backend plugin
+layout detection
+text-region and bbox reconciliation
+```
+
+Acceptance:
+
+```text
+scanned pages produce source spans with confidence
+low-confidence OCR never becomes silent audit-grade evidence
+ParserPreset.OCR routes v1 TrustDocument parsing through the configured local OCR worker
+doctruth parse/review-package --preset ocr produce OCR_REGION units with OCR provenance
+OCR unit confidence is propagated from the local worker into TrustUnitEvidence
+OCR confidence below 0.85 emits severe ocr_low_confidence and blocks audit-grade
+```
+
+Local OCR runtime strategy:
+
+```text
+Rust runtime owns the stable worker protocol, page rasterization, confidence
+gate, and TrustDocument reconciliation. Java SDK/CLI wrappers may launch,
+configure, and error-map the runtime, but must not own independent OCR evidence
+logic.
+
+RapidOCR/MNN is the preferred first local worker implementation candidate
+because it can run locally without calling a hosted OCR API, but it must be
+wrapped behind the DocTruth JSON stdin/stdout worker protocol and verified by
+doctor/smoke before being treated as available.
+
+The generic Java jar must not bundle OCR model binaries by default. Model files
+belong in an explicit local cache or user-supplied worker install, with SHA-256,
+engine name, model version, device, precision, timeout, and fallback recorded in
+ParserRun/model metadata.
+```
+
+RapidOCR/MNN acceptance:
+
+```text
+doctruth doctor --json reports a real rapidocr-mnn worker as executable and ready
+doctruth parse scanned.pdf --preset ocr works with that worker without Python import errors
+worker stdout carries text, per-region bbox, page number, confidence, engine, and warnings
+low-confidence worker output remains reviewable but not audit-grade
+smoke covers both success and low-confidence paths with the real adapter contract
+raw rapidocr CLI failures are surfaced as structured worker_unavailable or worker_protocol_error warnings
+```
+
+Legacy adapter status: `scripts/doctruth-rapidocr-mnn-worker` is a DocTruth-owned
+JSON worker adapter around RapidOCR kept for migration comparison and source-only
+oracle smokes. It is not packaged by the default source install or release
+tarball. Its `--doctor` self-test still separates executable availability from
+runtime readiness, and the adapter still handles RapidOCR 3.8-style array-like
+`boxes` / `txts` / `scores` output without NumPy truth-value failures.
+`scripts/smoke-doctruth-rapidocr-real.sh` is an
+opt-in real runtime smoke: when `DOCTRUTH_RAPIDOCR_REAL_SMOKE=1` is set, it
+creates or reuses an isolated venv, installs RapidOCR plus the ONNXRuntime
+backend, checks worker `--doctor`, runs direct OCR, then verifies Java CLI
+`parse --preset ocr` over a generated scanned PDF. Strict MNN backend readiness
+is now separately smoke-tested by `scripts/smoke-doctruth-rapidocr-mnn-backend.sh`:
+when `DOCTRUTH_RAPIDOCR_BACKEND=mnn` is set, worker `--doctor` must distinguish
+RapidOCR availability from actual `MNN`/`mnn` module availability and report
+`backend=mnn`, `backendReady`, and `backendVersion`.
+
+## 9. Core Data Contracts
+
+### Naming
+
+`ParsedDocument` is an implementation-flavored name. The product contract should
+use `TrustDocument`.
+
+```text
+TrustDocument
+ canonical, evidence-carrying document representation
+
+ContentBlock
+ flat reading-order block for LLM/RAG ingestion and cleanup
+
+ParseTrace
+ parser intermediate evidence layer with page/block/line/span observations
+
+TrustUnit
+ smallest stable citeable unit inside a TrustDocument
+
+ParsedDocument
+ optional internal or backward-compatible Java implementation name
+```
+
+Do not treat `TrustDocument` as automatically trusted. It is a document object
+that carries trust evidence, parser provenance, warnings, and audit-gate state.
+Whether it is audit-grade is decided later by the evidence gate.
+
+Avoid `TrustedDocument` for the core type because it overclaims. A document with
+severe parser warnings is still a `TrustDocument`, but it is not audit-grade.
+
+Use `TrustUnit` for the smallest citeable atom that can support downstream
+evidence. A `TrustUnit` may be backed by a text block, line span, table cell,
+figure caption, key-value region, or OCR region.
+
+### TrustDocument
+
+```text
+doc_id
+source_filename
+source_hash
+pages[]
+sections[]
+tables[]
+metadata
+parser_run
+outputs[]
+audit_grade_status
+warnings[]
+```
+
+### ContentBlock
+
+```text
+block_id
+document_id
+type
+page
+bbox
+reading_order
+heading_level
+text
+body
+source_unit_ids[]
+evidence_span_ids[]
+warnings[]
+```
+
+`ContentBlock` is not the audit source of truth. It is a clean, flat,
+reading-order projection for LLM/RAG consumers. It must always be derivable from
+`TrustDocument` plus `ParseTrace`, and every block that is citeable must point
+back to source units or evidence spans.
+
+### ParseTrace
+
+```text
+trace_id
+document_id
+parser_run_id
+pages[]
+warnings[]
+```
+
+`ParseTrace` is the audit/debug intermediate layer. It is allowed to be more
+verbose and more parser-shaped than `TrustDocument`, but it must be deterministic
+enough for tests, replay, and visual QA.
+
+### TracePage
+
+```text
+page_index
+page_number
+page_size
+preproc_blocks[]
+reading_blocks[]
+discarded_blocks[]
+images[]
+tables[]
+equations[]
+```
+
+### TraceBlock
+
+```text
+block_id
+type
+bbox
+reading_order
+confidence
+model_run_id
+lines[]
+source_unit_ids[]
+evidence_span_ids[]
+warnings[]
+```
+
+### TraceLine
+
+```text
+line_id
+bbox
+text
+spans[]
+```
+
+### TraceSpan
+
+```text
+span_id
+type
+content
+bbox
+score
+source_object_id
+evidence_span_id
+```
+
+### RenderedOutput
+
+```text
+output_id
+format
+profile
+content_hash
+source_doc_id
+parser_run_id
+created_at
+warnings[]
+anchors[]
+```
+
+### OutputAnchor
+
+```text
+anchor_id
+output_id
+evidence_span_id
+page
+bbox
+char_start
+char_end
+markdown_heading_path
+```
+
+### TrustUnit
+
+```text
+unit_id
+document_id
+unit_kind
+page
+bbox
+reading_order
+text
+source_object_id
+evidence_span_ids[]
+confidence
+warnings[]
+```
+
+### Page
+
+```text
+page_number
+width
+height
+text_layer_available
+image_hash
+layout_regions[]
+```
+
+### LayoutRegion
+
+```text
+region_id
+page_number
+kind
+bbox
+confidence
+reading_order
+model_run_id
+```
+
+### TableRegion
+
+```text
+table_id
+page_number
+bbox
+confidence
+cells[]
+html
+markdown
+model_run_id
+```
+
+### TableCell
+
+```text
+cell_id
+table_id
+row_start
+row_end
+col_start
+col_end
+bbox
+text
+confidence
+source_text_spans[]
+```
+
+### EvidenceSpan
+
+```text
+span_id
+source_id
+page
+line_start
+line_end
+char_start
+char_end
+bbox
+quote
+quote_hash
+layout_region_id
+table_cell_id
+confidence
+```
+
+### ParserRun
+
+```text
+parser_version
+preset
+backend
+models[]
+ocr_backend
+started_at
+duration_ms
+warnings[]
+```
+
+### ModelRun
+
+```text
+model_name
+model_version
+model_sha256
+model_license
+backend
+device
+precision
+confidence_threshold
+```
+
+## 10. Contract Tests To Lock
+
+These tests are inspired by Kreuzberg and Docling behavior, but they lock
+DocTruth contracts only. They must not copy implementation code or private test
+fixtures from either project.
+
+### `TrustDocumentContractTest`
+
+Locks the unified document model.
+
+```text
+PDF/DOCX/XLSX/CSV -> TrustDocument
+```
+
+Assertions:
+
+```text
+each source block has a stable id
+each source block has page provenance when the format can provide it
+each page-space bbox is normalized and valid
+reading_order is stable and monotonic within page/region scope
+headers, footers, and furniture are not silently merged into body text
+parser_run records backend, preset, version, warnings, and duration
+source_hash is stable for the same input bytes
+```
+
+Why this exists:
+
+Docling's central lesson is that downstream exports should come from a unified
+document representation. DocTruth's equivalent is `TrustDocument`.
+
+### `RenderedOutputContractTest`
+
+Locks the split between canonical truth and consumption views.
+
+Assertions:
+
+```text
+json_full is lossless for TrustDocument fields
+json_evidence preserves evidence spans, source ids, and parser/model metadata
+compact_llm is deterministic and materially smaller than json_full
+compact_llm preserves evidence ids, section hierarchy, table ids, bbox metadata, and warnings
+compact_llm file output uses an incremental writer path
+compact_llm source-map sidecars resolve compact text offsets back to units and evidence spans
+compact_llm benchmark metrics report size reduction, round-trip health, and source-map coverage
+markdown_clean has no inline bbox/provenance/internal ids
+markdown_clean plus source map can resolve back to evidence spans
+markdown_anchored includes stable evidence anchors
+markdown_review includes page markers and warnings
+html_review exposes bbox-compatible anchors
+html_review exposes page surfaces with page dimensions and image hashes
+html_review renders page-scoped visual bbox overlay nodes for units, tables, and cells
+render-pages writes deterministic page PNG artifacts and a hash-bound manifest
+review-package writes local static HTML review packages with page images and TrustDocument JSON
+plain_text contains readable text/table content without Markdown/evidence syntax
+source-map verification fails when rendered content or source hash changes
+Audit JSON includes source, canonical document, and evidence hashes
+Audit JSON can be signed or wrapped through the shared SDK SignatureProvider
+Audit JSON can be replay-verified against full TrustDocument JSON
+markdown output is GFM-compatible for tables, code fences, links, and escaping
+Markdown/HTML/plain/compact outputs preserve cross-format section parity
+clean markdown alone is never audit-grade
+same source hash + parser config produces byte-stable output
+```
+
+Why this exists:
+
+Docling treats JSON as lossless and Markdown/HTML as lossy exports. DocTruth
+keeps that idea but adds evidence source maps and audit gates.
+
+### `ReadingOrderContractTest`
+
+Locks layout correctness where basic PDF text extraction usually fails.
+
+Fixture classes:
+
+```text
+single_column_resume.pdf
+two_column_resume.pdf
+left_sidebar_resume.pdf
+right_sidebar_resume.pdf
+academic_two_column.pdf
+header_footer_noise.pdf
+rotated_page.pdf
+```
+
+Assertions:
+
+```text
+single-column body order is preserved
+two-column documents do not interleave unrelated columns
+sidebar metadata does not interrupt main-column work history
+section headings attach to the correct following body
+headers and footers are classified or warned, not repeated as body content
+ambiguous multi-column pages emit reading_order_uncertain
+```
+
+Why this exists:
+
+DocTruth's evidence chain is broken if a field cites text that was assembled in
+the wrong reading order.
+
+### `TableExtractionContractTest`
+
+Locks table structure and cell-level evidence.
+
+Fixture classes:
+
+```text
+bordered_table.pdf
+borderless_table.pdf
+merged_cell_table.pdf
+resume_skill_matrix.pdf
+invoice_line_items.pdf
+```
+
+Assertions:
+
+```text
+each TableRegion has page, bbox, confidence, and parser/model provenance
+each TableCell has row/column indexes
+merged cells preserve row and column spans (row_start/row_end, col_start/col_end)
+table markdown does not lose row/column meaning when structure is reliable
+HTML fallback is used when Markdown cannot represent rowspan/colspan safely
+json_full keeps table structure as data, not only as text
+field citations can point to table cells
+low-confidence table structure emits table_structure_low_confidence
+```
+
+Why this exists:
+
+Kreuzberg's table contract is useful: table output should include cell-level
+row/column indexing, merged-cell support, and Markdown or JSON output. DocTruth
+adds the requirement that extracted fields cite the cell, not merely the page.
+
+### `CitationContractTest`
+
+Locks source grounding.
+
+Assertions:
+
+```text
+each EvidenceSpan has source_id, page, quote, quote_hash, and confidence
+quote can be re-matched against TrustDocument text
+bbox is inside page bounds when present
+visual claims require bbox or a severe warning
+table-derived claims include table_cell_id
+quote_anchor_failed prevents audit-grade status
+```
+
+Why this exists:
+
+Docling provenance points back to page and layout. DocTruth must go further by
+requiring quote rematch and evidence-grade citation semantics.
+
+### `AuditGateContractTest`
+
+Locks DocTruth's stricter product promise.
+
+Severe warnings that block audit-grade:
+
+```text
+reading_order_uncertain
+table_structure_low_confidence
+quote_anchor_failed
+bbox_missing_for_visual_claim
+model_sha_mismatch
+ocr_low_confidence
+```
+
+Assertions:
+
+```text
+strict extraction cannot become audit-grade with severe parser warnings
+non-severe warnings remain visible in audit JSON
+fallback from model-assisted mode to heuristic mode is recorded
+strict mode fails instead of silently falling back
+```
+
+Why this exists:
+
+Parser uncertainty must be visible. DocTruth should never convert uncertain
+layout into fake certainty.
+
+### `ModelRuntimeContractTest`
+
+Locks local model behavior.
+
+Assertions:
+
+```text
+lite preset does not download heavy models
+offline mode never performs network access
+model SHA mismatch fails or emits a severe blocking warning
+standard preset records model name, version, SHA, backend, device, and precision
+fallback_reason is recorded when model-assisted parsing is unavailable
+doctor reports model cache state, backend availability, and memory estimate
+doctor reports local OCR worker executable readiness, engine, fallback engine, timeout, and disabled state
+configured model workers receive manifest-defined local model descriptors and READY cache status
+```
+
+Why this exists:
+
+Kreuzberg's model manifest and model-cache behavior are strong product
+precedents. DocTruth needs the same operational clarity with stricter audit
+semantics.
+
+### `ParserApiContractTest`
+
+Locks developer-facing entrypoints.
+
+Assertions:
+
+```text
+parse from file path
+parse from bytes
+parse batch
+parse via streaming input
+parse with preset
+render markdown/json/html/audit outputs
+render large documents without materializing every output format in memory
+same document + same parser config -> stable TrustDocument hash
+unsupported formats fail with stable error codes
+sidecar crash maps to structured ParseException
+```
+
+Why this exists:
+
+Kreuzberg's file/bytes, single/batch, sync/async matrix is a good API-shape
+benchmark. DocTruth should keep its Java API idiomatic while covering the same
+workflow surface.
+
+### `HtmlPassthroughContractTest`
+
+Locks HTML input and HTML-to-Markdown conversion behavior.
+
+Assertions:
+
+```text
+HTML headings preserve hierarchy in TrustDocument and Markdown
+HTML tables preserve row/column structure when representable
+HTML links preserve href and label
+fenced code blocks are not flattened into prose
+brackets, pipes, and Markdown-sensitive characters are escaped safely
+HTML-to-Markdown conversion avoids lossy intermediate round-trips
+source map resolves Markdown ranges back to HTML source nodes where available
+```
+
+Why this exists:
+
+HTML documents should not lose structure just because DocTruth normalizes them
+through a document model. The renderer must preserve useful HTML semantics for
+LLM/RAG consumption.
+
+### `ChunkingContractTest`
+
+Locks LLM/RAG consumption.
+
+Assertions:
+
+```text
+chunks do not cross unrelated sections by default
+chunk metadata includes heading path, page, source ids, and evidence span ids
+table chunks preserve table identity
+caption/figure chunks preserve nearby context
+clean text chunks can resolve back through source map
+oversized chunks split without losing evidence anchors
+```
+
+Why this exists:
+
+Docling's chunking model preserves metadata for downstream AI workflows.
+DocTruth needs the same retrieval usefulness while keeping replayable evidence.
+
+## 11. Quality Gates
+
+### Evidence Gate
+
+An extraction is not audit-grade when any of the following holds:
+
+```text
+source span has no stable page anchor
+quote cannot be re-matched
+bbox is missing where visual evidence is required
+table field lacks table cell or row/column context
+OCR confidence is below threshold
+parser emitted severe layout warnings
+model SHA does not match the expected value
+```
+
+Parser audit packages must be tamper-evident at the SDK boundary:
+
+```text
+source_hash
+canonical_hash
+evidence_hash
+signature_provider_applied
+package_file_written_with_exact_signed_payload
+replay_verifier_checks_full_trust_document_json
+```
+
+The local replay verifier must compare Audit JSON against full TrustDocument
+JSON and fail on mismatched document id, source hash, canonical hash,
+audit-grade status, parser run metadata, evidence hash, or evidence payload.
+The CLI contract is:
+
+```text
+doctruth verify-audit
+```
+
+This SDK-level package signing does not by itself provide external
+timestamping, key rotation, notarization, legal hold, or WORM storage. Those
+remain separate enterprise/runtime milestones.
+
+### Parser Warnings
+
+Warnings must be structured and visible:
+
+```text
+reading_order_uncertain
+multi_column_ambiguous
+table_structure_low_confidence
+layout_low_confidence
+ocr_low_confidence
+bbox_missing
+header_footer_contamination
+section_boundary_uncertain
+model_unavailable_fallback
+markdown_anchor_missing
+markdown_table_lossy
+```
+
+There must be no silent fallback from model-assisted parsing to heuristic
+parsing when the caller requested strict evidence.
+
+Current Rust runtime contract status:
+
+```text
+doctruth-runtime parse_pdf preset=table-lite
+TrustDocumentParser.parse(path, ParserPreset.STANDARD)
+TrustDocumentParser.parse(bytes, filename, ParserPreset.TABLE_LITE)
+TrustDocumentParser.parse(inputStream, filename, preset)
+TrustDocumentParser.parseBatch(paths, preset)
+doctruth parse --preset table-lite --format json
+```
+
+These entrypoints preserve the parsed output for local inspection while adding
+blocking `model_unavailable_fallback` warnings when the selected preset requires
+models that are not available under the current local/offline policy. Each
+missing required model must be represented by its own warning that includes the
+model identity and expected SHA-256, so audit/replay tools can distinguish
+missing layout, table, and OCR capabilities. The Rust runtime owns this
+fallback/audit contract for its protocol and all Java wrapper paths. It can
+route model-assisted presets to configured workers, including real RT-DETR/TATR
+artifact smokes and SLANeXT/OCR worker-protocol smokes, but it still does not
+execute ONNX, PaddleOCR/SLANeXT, RapidOCR, or MNN models in the Rust process
+itself. Java/PDFBox parser-quality code must not become a parallel model-worker
+implementation.
+
+Current Java-quality-core / Rust-shell status: the Rust runtime is no longer
+binary-only; its protocol entrypoints are callable through the
+`doctruth-runtime` library crate, while `src/main.rs` is a thin process wrapper.
+The product direction for this parity phase is: the Java/OpenDataLoader-compatible
+parser core is the current parser-quality default, and the Rust runtime is the shell for
+process lifecycle, model workers, resource accounting, benchmark packaging, and
+future parser modules after benchmark parity. A missing Rust runtime is an
+installation error for shell/model/benchmark behavior, not proof that the Java
+quality core is legacy.
+The path-first SDK parser exposes explicit backend selection:
+`DocTruth.withProvider(provider).parsePdf(path).withParser(preset).backend(AUTO)`
+uses the configured parser policy, `.backend(PDFBOX)` selects the Java quality
+core explicitly, and `.backend(SIDECAR)` requires a configured runtime shell.
+CLI parsing follows the same rule: Java quality core for current parser-quality
+work, Rust shell for process/model/benchmark behavior, explicit
+`--backend pdfbox` only for Java-core selection. Source install and release tarballs now ship
+`bin/doctruth-runtime`, and the `bin/doctruth` launcher exports
+`DOCTRUTH_RUNTIME_COMMAND` automatically when that same-directory runtime is
+present.
+
+Current implementation status: `doctruth-runtime` uses `pdf_oxide` for
+text-layer page extraction, text span bbox evidence, DocTruth-owned column-order
+post-processing, page MediaBox geometry, default rendered PNG page hashes,
+content-stream safety checks, and line-table/table-debug extraction. It reports
+`parserRun.pdfBackend.current = pdf_oxide` and `status = DEFAULT`. `lopdf` is
+not a runtime dependency or a default parser-core component.
+
+## 12. Evaluation Corpus
+
+The parser benchmark must include:
+
+```text
+simple single-column PDFs
+two-column resumes
+left-sidebar resumes
+right-sidebar resumes
+academic multi-column PDFs
+forms with key-value regions
+bordered tables
+borderless tables
+merged-cell tables
+scanned PDFs
+mixed text-layer + image PDFs
+documents with headers/footers
+documents with rotated pages
+```
+
+Every fixture should have expected outputs for at least:
+
+```text
+reading order
+section boundaries
+table cells
+field evidence anchors
+bbox overlays
+parser warnings
+```
+
+Corpus fixtures must be executable from a manifest, not only described in
+documentation. The manifest contract is:
+
+```text
+corpus name
+case name
+source fixture path
+or remote sourceUrl + sourceSha256
+sourceSha256 for every human-reviewed parser-accuracy case
+expected clean Markdown path
+expected TrustDocument JSON path
+minimum metric thresholds
+maximum metric thresholds for lower-is-better metrics
+paths resolved relative to the manifest file
+remote fixtures cached beside the manifest after SHA-256 verification
+missing fixtures fail with case-specific diagnostics
+each labeled case must include an expected TrustDocument JSON label
+```
+
+The manifest runner should reuse the same benchmark metrics and threshold gate
+as direct in-code benchmark cases. A generated fixture corpus is useful for
+regression protection. `scripts/smoke-doctruth-real-pdf-corpus.sh` now adds a
+small public W3C PDF fixture with a fixed SHA-256, a human-authored
+`TrustDocument` label, `kind: "human-labeled"` metadata, and required metric
+thresholds. This proves the remote-real-PDF human-labeled corpus path. Larger
+human-labeled multi-layout/OCR/table corpora are still required before claiming
+real-world parser accuracy, and those corpora should use
+`qualityProfile: "parser-accuracy"` coverage tags so a single easy fixture
+cannot satisfy the release gate.
+
+The generated parser-accuracy seed corpus smoke exists to keep this release
+gate executable in CI until those real-world labels are populated. It also
+asserts that case-level `labelId` and `tags` survive into CLI JSON output.
+
+Rust-first continuation status: `doctruth-runtime` now owns a native
+`benchmark_corpus` protocol command in addition to `parse_pdf`. The command
+loads manifest-relative source PDFs, expected clean Markdown, expected
+TrustDocument JSON labels, parser-accuracy label metadata, case `labelId` and
+`tags`, optional `sourceSha256` verification, required tag coverage, and metric
+minimums. Native metrics now include `reading_order_f1`,
+`quote_anchor_accuracy`, `bbox_coverage`, `bbox_iou`,
+`evidence_span_accuracy`, `table_cell_f1`, and `ocr_text_accuracy`; the
+expected-document metrics are computed against the checked-in
+`TrustDocument` JSON label for each case. Each corpus case can now declare
+`preset`, so model-assisted cases are measured through the same Rust
+model-worker handoff as direct `parse_pdf`. Human-reviewed parser-accuracy
+manifests require `labeling.minTotalCases` and per-case `sourceSha256`, and the
+Rust command rejects missing pins or SHA mismatches before parser metrics are
+accepted. Human-reviewed parser-accuracy manifests must also declare the core
+parser-quality metric set in `requiredMetrics`, so a broad corpus cannot pass
+while silently omitting bbox, table, OCR, or evidence-span quality gates. The
+same manifests must declare the core coverage tags `multi-layout`, `table`,
+`ocr`, `bbox`, and `source-map`, so required coverage cannot shrink to a single
+easy layout bucket. The Rust protocol also accepts `report_path` and writes the
+same `doctruth.parser-benchmark.report.v1` recorded report artifact shape used
+by the Java CLI `--report-out` path, with manifest, label/review metadata,
+manifest hash, threshold criteria, metrics, and per-case label/tag/source-hash
+evidence. The Rust protocol also accepts `verify_benchmark_report` with
+`report_path`, so runtime-produced recorded reports can be validated without
+rerunning the parser and without going back through the Java CLI.
+`scripts/smoke-doctruth-runtime-benchmark-corpus.sh` proves this path without
+the Java CLI by running a `table-lite` case through a configured worker. This
+migrates the corpus gate skeleton to Rust, but it is still a generated/local
+gate; real-world parser accuracy still requires broad human-reviewed fixtures
+and labeled real model/OCR quality evidence.
+
+Rust model-runtime migration status: `doctruth-runtime parse_pdf` now checks
+`DOCTRUTH_RUNTIME_MODEL_COMMAND` or `DOCTRUTH_MODEL_COMMAND` for model-assisted
+presets such as `table-lite`. When configured, Rust sends a JSON stdin request
+containing source path, source hash, preset, offline/download policy, and
+required model descriptors, then returns the worker's `TrustDocument` JSON.
+Invalid worker output fails with stable `MODEL_WORKER_FAILED` diagnostics.
+`scripts/smoke-doctruth-runtime-model-worker.sh` proves this path without the
+Java CLI. This moves the model-worker handoff into the Rust runtime. RT-DETR/
+TATR now have an opt-in Rust-runtime real-artifact entrypoint, and SLANeXT/OCR
+have Rust-runtime worker-protocol smokes plus generated real-route Rust runtime
+smokes. ADR 0011 accepts this worker boundary for v1: the runtime owns
+orchestration, manifests, request envelopes, validation, normalization, and
+benchmark execution, while ONNXRuntime, PaddleOCR/SLANeXT, RapidOCR, and MNN
+may execute in isolated local workers.
+
+The benchmark metrics include both parser-quality gates and LLM/replay output
+gates. `compact_llm_size_reduction` is computed as the UTF-8 byte reduction
+relative to `json_full`; `compact_llm_round_trip` must be `1.0` when the
+source-map-rendered compact text exactly matches `toCompactLlm()`; and
+`compact_llm_source_map_coverage` measures citeable units that can be resolved
+from compact source-map entries. `strict_warning_false_negative_rate` compares
+expected severe parser or unit-local warning codes from the labeled
+`TrustDocument` against actual severe warning codes and is enforced through the
+manifest's `maximums` gate. `section_boundary_f1` is enforced through normal
+manifest `minimums` and treats merged/missing heading boundaries as recall or
+precision loss. `evidence_span_accuracy` is also enforced through `minimums`
+and measures expected text-line coverage by actual evidence-bearing units. Each
+parsed case also records `parser_latency_ms`, `rss_peak_mb`, and
+`model_cache_size_mb`; corpus output reports aggregate `parser_latency_p50` and
+`parser_latency_p95` plus `compact_llm_size_reduction_min`; latency gates such
+as `parser_latency_p95` are enforced through `maximums` at the corpus aggregate
+level in both Java and Rust benchmark runners, and compact-corpus gates such as
+`compact_llm_size_reduction_min` are
+enforced through aggregate `minimums`. Resource metrics are per-case benchmark
+observations unless a worker/runtime reports stronger process-level peak memory.
+
+The CLI must expose this gate directly:
+
+```text
+doctruth benchmark-corpus
+doctruth benchmark-corpus --json
+doctruth benchmark-corpus --json --report-out parser-report.json
+doctruth benchmark-corpus --offline
+doctruth verify-benchmark-report parser-report.json
+```
+
+The command must be covered by a smoke script that creates generated PDF and OCR
+fixtures, writes expected Markdown and `TrustDocument` labels, verifies a
+passing corpus, verifies that generic threshold failures and OCR wrong-label
+failures exit non-zero with diagnosable metric names, and verifies that offline
+mode refuses uncached remote fixtures before any network request. Parser-accuracy
+runs should write a recorded report artifact with
+`reportFormat: doctruth.parser-benchmark.report.v1`, the resolved manifest path,
+`manifestSha256`, label/review metadata, copied `minimums`/`maximums`, actual
+`caseCount` and `casesPerTag` coverage, copied `coverageRequired`, computed
+`coverageSatisfied`, fixture-type coverage, OpenDataLoader-inspired behavior
+coverage, replay `validityInputs`, metrics, and per-case
+label/tag/fixture/behavior/source-hash/replay evidence. Manifests may also
+declare `externalEvaluations.opendataloader` pointing at an OpenDataLoader-style
+`evaluation.json`; reports then copy the evaluation reference under
+`externalEvaluations`, persist its SHA-256 and imported values under
+`externalMetrics.opendataloader`, and flatten NID, TEDS, MHS, and speed into
+`metrics.opendataloader_nid`, `metrics.opendataloader_teds`,
+`metrics.opendataloader_mhs`, and `metrics.opendataloader_speed` for normal
+threshold gates. This is an imported parser-quality signal only: OpenDataLoader
+schemas are not canonical, and TrustDocument remains the evidence/replay
+contract. The adapter can also export OpenDataLoader Bench-style prediction
+artifacts to an explicit output directory: `markdown/<document-id>.md` files
+and `summary.json`, with `externalArtifacts.opendataloaderPrediction` recording
+the artifact path, engine, and document count. These artifacts are for external
+evaluator compatibility only; they do not replace TrustDocument or parser trace
+evidence. Fixture taxonomy is
+declared with `requiredFixtureTypes`, `minCasesPerFixtureType`, case
+`fixtureTypes`, `casesPerFixtureType`, `fixtureCoverageRequired`, and
+`fixtureCoverageSatisfied`; recorded reports also include `fixtureResults`,
+which lists each fixture/layout bucket's case count, cases, aggregate metrics,
+and pass/fail status against copied thresholds. It covers simple single-column,
+two-column, sidebar-resume, table, borderless-table, scanned-OCR, invoice, and
+mixed-layout fixtures. Behavior taxonomy is declared with `requiredBehaviors`,
+`minCasesPerBehavior`, case `behaviors`, `casesPerBehavior`,
+`behaviorCoverageRequired`, and `behaviorCoverageSatisfied`; it covers
+OpenDataLoader-inspired XY-Cut edge cases, parser safety filters,
+structure-tree preference, and table border/cluster heuristics. `validityInputs`
+must state whether the recorded report can be replayed from source hashes,
+manifest hash, parser configuration, model/cache manifest state, thresholds,
+expected labels, and the actual `TrustDocument` output. Each case must include a
+`replay` object for `sourceRefReplayable`, `quoteReplayable`, and
+`evidenceSpanReplayable`, plus the actual `TrustDocument` output and
+`actualTrustDocumentSha256` so the recorded report can prove its parser-quality
+and replay claims are bound to the real parsed document, not only copied
+metrics.
+
+Current OpenDataLoader Bench runner status: `scripts/run-doctruth-opendataloader-bench.sh`
+builds `doctruth-runtime`, runs Rust `opendataloader_prediction` over the
+vendored `third_party/opendataloader-bench/pdfs/` corpus, writes
+`prediction/doctruth-runtime/markdown/*.md`, per-document `cases/*.json`,
+per-document `failures/*.json`, `summary.json`, `resources.json`, and
+`prediction-report.json`, and then runs Rust `opendataloader_evaluate_prediction`
+by default to produce `evaluation.json`. Successful runs leave `failures/`
+empty and never write a root `errors.json`. The official upstream OpenDataLoader
+Python evaluator remains available only through explicit `--evaluator official`
+or oracle/baseline scripts; it is not the default DocTruth prediction/evaluation
+path. `scripts/smoke-doctruth-opendataloader-evaluator-parity.sh` provides a
+skip-safe fixture-level parity smoke between the Rust evaluator and the official
+upstream evaluator for exact text, heading-level normalization, and table
+wrapper/header normalization. This is not yet a full-corpus proof that the Rust
+evaluator can replace the official oracle for all APTED/lxml/rapidfuzz edge
+cases. Legacy Python/OpenDataLoader hybrid baseline scripts are fail-closed and
+require `DOCTRUTH_ALLOW_PYTHON_ORACLE=1` before launching the heavy oracle path.
+The legacy Python prediction adapter also refuses direct command-line execution
+without the same opt-in; importing it from legacy smoke tests remains a test
+helper boundary. Even `--evaluator official` is fail-closed behind the opt-in so
+the Python/APTED/lxml/rapidfuzz upstream evaluator cannot be launched by
+accident. The default Rust runner and MNN promotion runner must not call the
+Python prediction adapter. The first full local baseline on 200 vendored PDFs parsed 199
+documents and failed one scanned/no-text-layer document. It reported
+`overall_mean=0.509092484964239`, `nid_mean=0.7591850124827885`,
+`teds_mean=0.0`, and `mhs_mean=0.0025571766718785185`, with
+`total_elapsed=389.71747279167175` seconds and one extreme slow sample
+`01030000000141` at about 180 seconds.
+
+The first export-layer optimization adds conservative Markdown heading
+promotion, TrustDocument table-to-HTML rendering, and a narrow line-span table
+fallback for `No.`/number/name/value table patterns. The next full local run
+still parsed 199 of 200 documents, but improved the OpenDataLoader aggregate to
+`overall_mean=0.5492221210080162`, `nid_mean=0.7665022379711967`,
+`teds_mean=0.06498004117639267`, and `mhs_mean=0.12239636974611434`.
+This is an honest baseline, not a pass gate: reading order has a usable
+text-layer foundation, while table fidelity, heading hierarchy, OCR fallback,
+and slow-sample timeout/parallelism remain required parser-quality work before
+DocTruth can claim OpenDataLoader/Docling level extraction quality.
+
+OpenDataLoader parity is measured, not asserted. A behavior is considered
+ported only when it has a Rust contract test, an upstream source reference, and
+either a focused OpenDataLoader Bench case or a full200 report showing the
+effect. Until full200 reaches the accepted baseline, DocTruth should be
+described as OpenDataLoader-inspired and progressively porting parity, not
+OpenDataLoader-equivalent.
+
+The current recorded full200 baseline is
+`docs/parser/reports/opendataloader-full200-2026-06-23.md` with 200 documents,
+199 parsed, 1 failed, `overall_mean=0.738756`, `nid_mean=0.859061`,
+`teds_mean=0.475822`, and `mhs_mean=0.469231`. The paired comparison report is
+`docs/parser/reports/opendataloader-hybrid-comparison-2026-06-23.md`; it covers
+the same 200 documents as the OpenDataLoader hybrid baseline and records a
+remaining delta of `overall=-0.167816`, `nid=-0.074670`, `teds=-0.451821`, and
+`mhs=-0.351545`. This means the current gap is primarily quality, especially
+tables and heading hierarchy, not corpus mismatch.
+
+The Rust-owned runner supports `--timeout-seconds` without returning to the
+Python prediction adapter. When this option is present, `opendataloader_prediction`
+spawns the current `doctruth-runtime` binary per document, sends a normal
+`parse_pdf` request over stdin, kills the child on timeout, writes an empty
+Markdown artifact, and records `errorCode=PARSE_TIMEOUT` in `summary.json` and
+the affected document's `failures/<document-id>.json`. Without this option,
+prediction stays on the faster in-process Rust path. Historical context: the
+legacy Python adapter used the same kind of
+per-document isolation to keep full-corpus iteration from being dominated by a
+single pathological PDF; a 30-second run completed in `239.5388069152832`
+seconds, marked `01030000000141` as timed out, kept the scanned/no-text-layer
+failure `01030000000165`, and retained nearly identical aggregate quality:
+`overall_mean=0.549140667373931`, `nid_mean=0.7663393307030263`,
+`teds_mean=0.06498004117639267`, and `mhs_mean=0.12239636974611434`.
+
+Current structure-tree preference status: the Rust runtime now asks `pdf_oxide`
+for canonical page reading order, which prefers a trustworthy Tagged-PDF
+`/StructTreeRoot` before geometric inference. `parserRun.readingOrder` and
+`parseTrace.readingOrder` record whether the chosen source is `structure-tree`
+or fallback `xy-cut`. When a tagged PDF sets `/MarkInfo /Suspects true`, the
+runtime falls back to XY-Cut and emits a non-severe
+`structure_tree_suspect_fallback` warning. This proves the reading-order
+preference and replay trace boundary; richer role/heading/list/table semantic
+export from tags remains a later parser-quality expansion.
+
+Current table-migration status: borderless/text-spatial table extraction uses
+`pdf_oxide` `detect_tables_from_spans` and normalizes the result through
+DocTruth `TrustDocument` table cells. Bordered-grid, merged-cell, row-span, and
+adjacent-page continuation extraction now use `pdf_oxide` content-stream
+primitives. `lopdf` is no longer a `doctruth-runtime` dependency or default
+parser-core component. This completes the Rust MVP table migration while broad
+real-world table accuracy and model-assisted calibration remain parser-quality
+follow-ups.
+
+Current parser-safety status: the Rust runtime has OpenDataLoader-style
+content-safety filters for duplicate positioned text, whitespace-only spans,
+off-page spans, tiny spans, near-white/background-like spans, and invisible
+render-mode text. These filters emit severe warnings such as
+`duplicate_text_filtered`, `whitespace_text_filtered`, `off_page_text_filtered`,
+`tiny_text_filtered`, `background_text_filtered`, and `hidden_text_filtered`,
+then mark the parse `NOT_AUDIT_GRADE`. Robust rendered-page background
+comparison remains a later parser-quality expansion, not a default parser-core
+blocker.
+The CLI must also verify a recorded report without rerunning the parser, so CI
+can prove that an archived parser-quality report still matches its manifest,
+thresholds, coverage counts, copied coverage requirements, metric values, and
+source pins. Recorded reports must also prove that aggregate metrics are
+consistent with the per-case metrics they summarize, that coverage satisfaction
+matches actual case tags, fixture types, and behavior tags, that replay validity
+inputs remain present, that imported OpenDataLoader metrics still match the
+referenced `evaluation.json` and its hash, and that case replay fields match the
+metrics/source hashes they summarize. They must also recompute each case's
+`actualTrustDocumentSha256` from the embedded `actualTrustDocument` and replay
+case-level parser-quality metrics against the manifest's expected Markdown and
+expected `TrustDocument` labels, so a report cannot be altered by changing only
+the aggregate, only external metrics, only coverage fields, only case-level
+replay evidence, only the parser output hash, or only the embedded parser
+output.
+Cached remote
+fixtures remain usable offline after SHA-256 verification.
+`scripts/smoke-doctruth-real-ocr-corpus.sh` is an opt-in runtime corpus smoke:
+when `DOCTRUTH_REAL_OCR_CORPUS_SMOKE=1` is set, it installs or reuses an
+isolated RapidOCR + ONNXRuntime environment, verifies the RapidOCR worker
+doctor, generates a scanned-PDF fixture, and gates `ocr_text_accuracy` through
+`benchmark-corpus`. This proves the real OCR runtime can feed the corpus gate on
+a generated scanned fixture, not broad real-world OCR accuracy.
+
+## 13. Architecture
+
+### Phase Architecture
+
+```text
+Java API
+ |
+ | existing Java ParsedDocument / Citation compatibility
+ | new TrustDocument contract
+ v
+Rust Runtime Adapter
+ |
+ +-- Rust core native binding
+ |
+ +-- Rust sidecar process
+ |
+ +-- Java PDFBox compatibility/oracle mode
+ only when explicitly selected for migration and differential tests
+
+Rust core
+ |
+ +-- text layer parser
+ +-- page rasterizer
+ +-- layout detector
+ +-- table recognizer
+ +-- OCR backend
+ +-- model/cache verifier
+ +-- benchmark corpus runner
+ +-- evidence reconciler
+ +-- TrustDocument emitter
+```
+
+The dependency direction must stay one-way: Java calls Rust; Rust does not
+depend on Java parser internals.
+
+### Why Sidecar First
+
+Sidecar is the safest first bridge:
+
+```text
+no JNI packaging complexity at the beginning
+crash isolation
+easier model cache management
+same runtime usable by CLI and MCP
+Java SDK can keep stable contracts
+```
+
+Native binding can come after contracts stabilize.
+
+## 14. TDD Execution Mode
+
+This PRD should be implemented with milestone-sized batch TDD, not with one
+micro-feature per loop and not with the entire PRD as one giant failing test
+set.
+
+For each milestone:
+
+```text
+1. derive the concrete contract from this PRD
+2. write all RED tests for that milestone first
+3. run the focused test set and confirm failures are caused by missing behavior
+4. implement the milestone in one coherent development pass
+5. rerun focused tests
+6. rerun required smoke tests
+7. update PRD/planning status with what is proven and what remains unproven
+```
+
+Milestone scope should be large enough to avoid thrashing, but small enough
+that failures remain diagnosable. Good milestone boundaries are:
+
+```text
+signed audit package and replay package integrity
+labeled parser benchmark corpus harness
+model runtime interface and cache/fallback contracts
+layout-region detection contract
+table-region and cell-recognition contract
+OCR routing and low-confidence warning contract
+HTML review overlay/source-map contract
+streaming parse/render contract
+```
+
+Do not batch unrelated hard problems into one milestone. For example,
+model-assisted layout detection, OCR, external notarization, and WORM/legal
+hold are separate milestones even though they all support audit readiness.
+
+Completion requires current evidence, not intent:
+
+```text
+focused unit tests for the milestone
+public API snapshot update when public surface changes
+CLI or runtime smoke when user-facing behavior changes
+full Maven test suite when Java contracts change
+Cargo tests and runtime smoke when Rust runtime changes
+git diff --check
+```
+
+If a milestone only writes partial scaffolding, mark it as scaffolding. Do not
+claim parser quality, replay completeness, or audit-grade readiness unless the
+tests and smoke prove that specific claim.
+
+## 15. Implementation Phases
+
+### Phase 0: Contract Freeze
+
+Deliverables:
+
+```text
+TrustDocument v1 draft
+ContentBlock projection contract
+ParseTrace intermediate evidence contract
+LayoutRegion contract
+TableRegion/TableCell contract
+EvidenceSpan contract
+ParserRun/ModelRun metadata
+strict parser warning taxonomy
+```
+
+Exit criteria:
+
+```text
+old Java API remains source-compatible
+new contracts can represent current parser output
+audit JSON can include parser/model metadata
+clean Markdown/content blocks are derived from the canonical parse
+parse trace can represent page/block/line/span observations
+```
+
+### Phase 0A: Layered Parser Output Contract
+
+Deliverables:
+
+```text
+markdown_clean profile
+content_blocks.json profile
+parse_trace.json profile
+trust.json profile
+content block source-unit/evidence-span links
+parse trace page/block/line/span ids
+discarded block trace contract
+layout/span debug artifact contract
+```
+
+Exit criteria:
+
+```text
+content_blocks.json preserves reading order without inline evidence noise
+parse_trace.json preserves page/block/line/span/bbox/source refs
+clean Markdown can be regenerated from content blocks
+TrustDocument evidence spans can be traced back to parse trace spans
+visual debug artifacts can be generated from the same trace ids
+```
+
+### Phase 1: Java Baseline Hardening
+
+Deliverables:
+
+```text
+multi-column section regression suite
+sidebar/main-column fixtures
+table fixture suite
+header/footer contamination tests
+parser warnings
+evidence gate integration
+```
+
+Exit criteria:
+
+```text
+current PDFBox path fails visibly instead of silently
+known cross-column bugs are covered by tests
+all current unit tests pass
+```
+
+### Phase 2: Rust Sidecar MVP
+
+Deliverables:
+
+```text
+doctruth-runtime binary
+JSON stdin/stdout protocol
+streaming parse protocol
+parse_pdf command
+benchmark_corpus command
+configured model-worker handoff
+doctor command
+model cache directory
+SHA256 verification
+Java sidecar adapter
+CLI adapter
+```
+
+Exit criteria:
+
+```text
+Java SDK can call sidecar parser
+CLI can use the same runtime
+sidecar crash returns structured ParseException
+model cache can be verified offline
+```
+
+### Phase 3: Layout Detection
+
+Deliverables:
+
+```text
+ONNX runtime integration
+RT-DETR-compatible layout model adapter
+layout region output
+reading-order reconciliation
+confidence thresholds
+CoreML/CUDA provider detection where available
+```
+
+Exit criteria:
+
+```text
+layout regions are visible in TrustDocument JSON
+multi-column reading order improves on benchmark corpus
+low-confidence layout emits warnings
+```
+
+Current status: ONNXRuntime loading, RT-DETR/DETR-like layout output decoding,
+confidence warnings, and resource metrics are covered by synthetic ONNX smokes.
+`scripts/smoke-doctruth-real-rtdetr-artifact.sh` now validates a public
+document-layout RT-DETR artifact from `Kreuzberg/layout-models` through the
+same cache/model-worker/parse path. It proves rendered-page input,
+`orig_target_sizes`, `labels`/`boxes`/`scores` decoding, and Java CLI
+integration. The repository still does not bundle RT-DETR weights by default
+or claim broad document-layout accuracy without labeled corpus results.
+
+### Phase 4: Table Recognition
+
+Deliverables:
+
+```text
+TATR-compatible table model adapter
+small table model preset
+server table model preset
+table cell reconstruction
+cell-level EvidenceSpan
+HTML/Markdown/JSON table output
+```
+
+Exit criteria:
+
+```text
+table fields cite cells, not only page-level blocks
+merged cells preserve row/col span
+borderless table fixtures improve over heuristic baseline
+```
+
+Current status: Java/PDFBox now recovers generated bordered-grid tables, a
+conservative class of borderless aligned text matrices, generated bordered
+tables with horizontal merged cells, and generated bordered tables with vertical
+row spans into `TrustTable`, `TrustTableCell`, and `TABLE_CELL` units with
+normalized bboxes. Generated merged cells preserve `rowRange`/`columnRange` span
+data and are gated by `table_cell_f1` in generated PDF benchmark fixtures.
+Java/PDFBox also now merges adjacent generated bordered-table continuation
+pages with repeated headers, dedupes the continuation header, and keeps
+continued cell units on their original source page. The Rust
+`doctruth-runtime` now has parity for generated bordered-grid tables, short
+aligned borderless text matrices, generated horizontal merged cells, and
+generated vertical row spans through content-stream text points. It now also
+merges adjacent generated bordered-table continuation pages with repeated
+headers, dedupes the continuation header, and keeps continued `TABLE_CELL`
+units on their original source page. Explicit Cargo contract tests, runtime
+smoke, and Java CLI sidecar smoke cover these JSON paths. A separate Java CLI
+sidecar borderless smoke also covers JSON, Markdown, and plain-text rendering.
+`TABLE_LITE` also has a configurable model-worker path that can return
+model-produced tables through the same `TrustDocument` contract and CLI JSON
+smoke. Its request now supports manifest-defined local model descriptors and
+SHA-verified READY cache artifacts. The opt-in real model artifact smoke can be
+run with `DOCTRUTH_REAL_MODEL_EXPECTED_TASK=table-structure-recognition` to
+validate user-supplied TATR/SLANeXT-compatible ONNX artifacts through the same
+cache/model-worker/parse path. `scripts/smoke-doctruth-real-tatr-artifact.sh`
+now validates the public Xenova Table Transformer quantized ONNX artifact
+through that same cache/model-worker/parse path and also verifies the direct
+worker uses a rendered PDF page as model input. The smoke now also exercises
+the real TATR row/column label set and requires multi-row/multi-column
+intersected cell output. This is still mostly heuristic/generated-fixture table
+support until labeled real-world table accuracy and additional production table
+models are checked in or supplied by CI. `doctruth-slanext-table-worker`
+provides the PaddleOCR/SLANeXT adapter boundary for the `table-server` path, and
+`scripts/smoke-doctruth-slanext-table-worker.sh` covers that protocol with a
+fake PaddleOCR module. The real SLANeXT smoke remains opt-in because the generic
+DocTruth package must not bundle PaddleOCR/Paddle/model binaries by default.
+It has been verified with PaddleOCR 3.7.0/PaddlePaddle 3.3.1 in an isolated
+Python 3.10 environment. `doctruth-runtime` can now route `table-server`
+through the SLANeXT worker protocol, and the generated real PaddleOCR/SLANeXT
+smoke has now been recorded through that Rust-runtime route.
+The packaged `smoke-doctruth-real-model-suite.sh` combines RT-DETR, TATR, and
+SLANeXT runtime smokes so release jobs can run the same model gate instead of
+calling each script manually.
+
+### Phase 5: OCR Routing
+
+Deliverables:
+
+```text
+text-layer quality detector
+OCR backend interface
+page image rasterization
+OCR text + bbox reconciliation
+OCR confidence gate
+```
+
+Current status: the Rust runtime owns page image hashes through `pdf_oxide`
+rendering by default, while Java page-image helpers remain package/review
+compatibility utilities. `doctruth review-package` bundles review HTML,
+TrustDocument JSON, and page image artifacts into a single local directory.
+`ParserPreset.OCR` now routes v1 `TrustDocumentParser` and CLI TrustDocument
+outputs through the Rust runtime and configured local OCR/model-worker
+protocol (`DOCTRUTH_RUNTIME_MODEL_COMMAND`, `DOCTRUTH_OCR_COMMAND` /
+`doctruth.ocr.command`, default engine `mnn`) and marks recovered units as
+`OCR_REGION` with `rust-sidecar+model-worker` parser provenance when the Rust
+runtime route is used. OCR page confidence is propagated into
+`TrustUnitEvidence`; confidence below `0.85` emits a severe
+`ocr_low_confidence` warning on the unit and makes the
+document `NOT_AUDIT_GRADE` while still preserving the recovered text for
+review/replay.
+The generic jar still does not bundle RapidOCR/MNN models, and the raw local
+`rapidocr` CLI is not treated as verified unless wrapped behind the worker
+protocol. `doctruth-rapidocr-mnn-worker` now provides that wrapper and is
+packaged with the CLI. It also provides `--doctor` readiness JSON so
+`doctruth doctor --json` can distinguish an executable worker from a RapidOCR
+runtime that can actually import and initialize. On the current development
+machine the default global Python/RapidOCR environment still reports
+`rapidocr_unavailable` because its NumPy install is incompatible, but an
+isolated RapidOCR + ONNXRuntime backend smoke now passes and proves direct
+worker OCR plus Java CLI scanned-PDF OCR. An opt-in real OCR corpus smoke now
+uses the same RapidOCR worker behind `benchmark-corpus` and gates
+`ocr_text_accuracy` on a generated scanned-PDF label. Strict MNN doctor mode now
+requires a real importable `MNN`/`mnn` backend module before reporting backend
+readiness; the CLI release smoke also verifies this field contract with a fake
+backend module. Rust runtime page image hash parity is now covered by
+`pdf_oxide` rendered PNG runtime tests, and `doctruth-runtime` can route `ocr`
+through the RapidOCR worker protocol.
+Persisted Rust page image artifact output, real MNN OCR recognition quality,
+and labeled real-world OCR accuracy remain separate work.
+
+Exit criteria:
+
+```text
+scanned PDFs produce evidence spans
+low-confidence OCR cannot become audit-grade silently
+OCR output is replayable through ParserRun metadata
+```
+
+### Phase 6: MCP/Skill Distribution
+
+Deliverables:
+
+```text
+doctruth MCP server
+skill package
+runtime bootstrap
+doctor checks
+document evidence tools
+model cache warmup
+compact_llm wire output
+GFM-quality Markdown renderer
+HTML passthrough renderer path
+```
+
+Exit criteria:
+
+```text
+an agent can parse a document through MCP
+the response includes evidence spans and bbox references
+MemTruth can store DocTruth evidence as replayable source objects
+LLM-facing output is compact, deterministic, and source-map resolvable
+```
+
+Current status: `doctruth mcp` now provides a local stdio MCP gateway with
+`initialize`, `tools/list`, and `tools/call` support for
+`doctruth.parse_document`, `doctruth.get_layout_regions`,
+`doctruth.get_table_cells`, `doctruth.get_evidence_span`, and
+`doctruth.verify_citation`, plus `doctruth.warm_model_cache` for local model
+cache preflight. The document tools parse a local document through the v1
+`TrustDocumentParser` contract and return MCP `structuredContent` containing
+compact LLM text, JSON evidence units, bbox-bearing layout regions, table cell
+bboxes, citation verification, audit status, source hash, and source-map
+entries. The model cache tool verifies caller-supplied local model descriptors
+against a cache directory and reports READY/MISSING/SHA_MISMATCH without
+implicit downloads. A packaged smoke verifies the shaded CLI can parse
+generated PDFs through MCP and return evidence spans, bbox references, table
+cells, citation verification, and model cache readiness. A local skill package
+now lives under
+`skills/doctruth/` with a concise `SKILL.md`, agent metadata, and a bootstrap
+script that writes a stdio MCP config pointing to `doctruth mcp`; a smoke test
+verifies the package and config writer. This is still a local single-user
+stdio gateway; remote/distributed MCP deployment remains outside this slice.
+
+The standalone CLI also now supports `doctruth cache warm
+--preset <preset> [--cache <dir>] [--offline] [--json]`. It installs
+manifest-defined local, `file://`, or HTTP(S) model artifacts into the
+deterministic cache filename, then verifies SHA-256 with the shared cache
+verifier. Remote downloads stream through JDK `HttpClient` into a temp file
+before entering the cache, and `--offline` refuses remote model sources before
+any network request. This establishes the install/preflight contract for future
+real ONNX/TATR/SLANeXT model artifacts. Manifest runtime hints are preserved
+through cache, doctor, and worker JSON so a later real model worker can
+distinguish layout detection, table structure, backend, format, precision, and
+license requirements. Curated real model URLs and production execution are
+still not implemented for RT-DETR/TATR/SLANeXT, but ONNXRuntime smokes now
+prove the local ONNX execution boundary plus synthetic RT-DETR/DETR-like
+layout and TATR/DETR-like table decoder contracts over `pred_logits`/
+`pred_boxes`.
+
+## 16. Acceptance Metrics
+
+Minimum parser benchmark gates for a beta runtime:
+
+```text
+single-column reading_order_f1 >= 0.98
+two-column reading_order_f1 >= 0.92
+section_boundary_f1 >= 0.90
+table_region_iou >= 0.85
+table_cell_f1 >= 0.80 for standard preset
+quote_anchor_accuracy >= 0.97
+bbox_iou >= 0.80 for cited visual spans
+strict parser warning false-negative rate <= 2%
+```
+
+Runtime gates:
+
+```text
+lite p95 parse latency <= 1.5s for 3-page text-layer PDF
+standard p95 parse latency <= 8s CPU for 3-page PDF
+large-document streaming path avoids loading all pages and all rendered outputs into memory at once
+compact_llm output is at least 25% smaller than json_full on the benchmark corpus
+GFM renderer preserves fenced code blocks, tables, links, and bracket escaping
+HTML passthrough avoids lossy intermediate conversion for HTML sources
+model cache verifies SHA256 before use
+offline mode never attempts network download
+local OCR worker readiness is reported by doctor
+sidecar RSS and peak model memory are reported by doctor
+ONNX worker parse response reports wall time, inference time, RSS, and peak memory
+```
+
+Current status: the Rust sidecar `--doctor` response now reports `rssMb` and
+`peakMemoryMb` from local process memory without adding runtime dependencies.
+The Rust protocol contract and runtime smoke assert these fields. With no model
+loaded, `peakMemoryMb` represents process high-water or RSS fallback rather than
+production model peak memory.
+
+## 17. Open Questions
+
+```text
+Which Apache/MIT-compatible model artifacts can be redistributed or referenced?
+Should DocTruth ship model download manifests or only model adapters?
+Should table-server presets live in OSS, or only as optional user-supplied models?
+Should embedded native/JNI runtime replace the sidecar as the default once the Rust library core is mature?
+What is the minimum fixture corpus size before claiming parser-runtime alpha?
+Should compact_llm use an existing TOON-compatible syntax or a DocTruth-owned compact evidence format?
+Which Rust Markdown renderer should be the default for GFM parity?
+Should DocTruth keep `pdf_oxide` as the default OSS Rust PDF backend, or support a secondary PDFium-compatible backend only for specific enterprise/runtime environments?
+```
+
+## 18. Product Boundary
+
+DocTruth parser runtime owns:
+
+```text
+document parsing
+layout detection
+OCR routing
+table structure recognition
+source grounding
+evidence spans
+parser/model provenance
+audit-grade gating
+```
+
+DocTruth parser runtime does not own:
+
+```text
+agent memory
+long-term replay ledger
+general RAG retrieval
+hosted team review workflow
+business-domain extraction templates
+```
+
+MemTruth consumes DocTruth evidence. It should not re-parse documents when
+DocTruth can provide source-grounded evidence spans.
diff --git a/docs/plans/2026-06-17-parser-quality-replication-plan.md b/docs/plans/2026-06-17-parser-quality-replication-plan.md
new file mode 100644
index 00000000..42357a40
--- /dev/null
+++ b/docs/plans/2026-06-17-parser-quality-replication-plan.md
@@ -0,0 +1,355 @@
+# Parser Quality Replication Plan
+
+Date: 2026-06-17
+
+## Current Truth
+
+The OpenDataLoader Bench runner is now real enough to show that DocTruth parser
+quality is still behind the strongest references. The latest optimized timeout
+run on the vendored 200-PDF corpus produced:
+
+| Engine | Overall | NID | TEDS | MHS |
+| --- | ---: | ---: | ---: | ---: |
+| DocTruth `doctruth-runtime-optimized-timeout` | 0.549 | 0.766 | 0.065 | 0.122 |
+| OpenDataLoader | 0.831 | 0.902 | 0.489 | 0.739 |
+| Docling | 0.882 | 0.898 | 0.887 | 0.824 |
+| OpenDataLoader hybrid | 0.907 | 0.934 | 0.928 | 0.821 |
+
+After the first replication pass, DocTruth has a measurable export-layer lift
+but is still far from reference parity:
+
+| Engine | Overall | NID | TEDS | MHS |
+| --- | ---: | ---: | ---: | ---: |
+| DocTruth `doctruth-runtime-replication-pass2` | 0.563 | 0.739 | 0.188 | 0.196 |
+
+Pass2 is better than `doctruth-runtime-optimized-timeout` on overall score,
+TEDS, and MHS, but it regresses on NID (`0.739` versus `0.766`) and does not
+reproduce OpenDataLoader or Docling quality. Treat the pass2 work as a
+diagnostic and export-compatibility lift, not as completed parser-core parity.
+
+This means the current gap is not only Markdown rendering. The largest quality
+gaps (optimized-timeout scores versus the reference range) are:
+
+- table reconstruction: `TEDS 0.065` versus `0.489-0.928`
+- heading hierarchy: `MHS 0.122` versus `0.739-0.824`
+- reading order/text normalization: `NID 0.766` versus `0.898-0.934`
+
+The previous OpenDataLoader-inspired Rust slices ported useful local behavior,
+but they did not reproduce the complete parser-quality pipeline. Do not treat
+the XY-Cut++, filter, or export-layer slices as quality parity.
+
+## Reference Pipelines
+
+### OpenDataLoader Base
+
+The benchmark adapter runs `opendataloader_pdf.convert(...)` or its JAR with:
+
+```text
+format = markdown
+table_method = cluster
+image_output = off
+quiet = true
+```
+
+This is the target for the first parity milestone because it is Apache-2.0,
+fast, and has published bench output.
+
+### OpenDataLoader Hybrid
+
+The hybrid reference starts `opendataloader_pdf.hybrid_server` and runs:
+
+```text
+hybrid = docling-fast
+format = markdown
+image_output = off
+```
+
+This is not a single Rust heuristic. It is a composition of OpenDataLoader's
+layout/table/export path with Docling-assisted handling for hard cases. Treat
+it as the high-accuracy target, not the first Rust-core baseline.
+
+### Docling
+
+The benchmark Docling runner uses:
+
+```text
+DocumentConverter().convert(pdf).document.export_to_markdown()
+```
+
+Docling is a strong reference for unified document modeling, table output, and
+heading hierarchy. It should be used as a reference/oracle in evaluation and
+triage, not as DocTruth's canonical schema.
+
+## Canonical Boundary
+
+DocTruth's canonical output remains:
+
+```text
+TrustDocument
+content_blocks.json
+parse_trace.json
+clean Markdown + source map
+audit/review package
+```
+
+External parser outputs are observations only. No external Markdown, Docling
+document, OpenDataLoader result, or hybrid output becomes canonical until it is
+normalized into `TrustDocument` and replayable evidence anchors.
+
+Java/PDFBox remains a wrapper, compatibility, and differential-oracle surface
+only. Parser-quality work belongs in `runtime/doctruth-runtime`.
+
+## Why Quality Is Still Low
+
+The current DocTruth optimized run mostly emits text-layer line spans and
+export-layer guesses. That helps narrative text but fails the main benchmark
+metrics:
+
+1. Tables are often not detected as structured tables, so TEDS is near zero.
+ Export fallbacks fix simple cases but cannot recover complex rowspan,
+ colspan, multi-header, or continuation tables.
+2. Heading promotion is heuristic and not tied to a real section tree. MHS
+ stays low because Markdown heading levels and heading/content grouping are
+ wrong or missing.
+3. Reading order still needs stronger paragraph joining, dehyphenation,
+ header/footer/page-number suppression, tagged-structure trust scoring, and
+ multi-column/sidebar ordering across real PDFs.
+4. Scanned/no-text PDFs still need real OCR routing in the benchmark path.
+5. We do not yet have an automated per-case diff loop that compares DocTruth,
+ OpenDataLoader, Docling, and ground truth by failure category.
+
+## Replication Strategy
+
+### Phase A: Reference Oracle Harness
+
+Status: complete for local vendored artifacts.
+
+Build a dev-only reference lane that can run or consume:
+
+- OpenDataLoader base predictions
+- OpenDataLoader hybrid predictions
+- Docling predictions
+- DocTruth predictions
+- ground-truth Markdown
+
+The harness should produce per-document comparison records:
+
+```text
+document_id
+fixture type
+DocTruth scores
+OpenDataLoader scores
+Docling scores
+metric deltas
+top failing metric
+failure bucket
+paths to GT/prediction Markdown
+paths to TrustDocument/content_blocks/parse_trace when available
+```
+
+Done when the report can answer: "which 20 PDFs lose the most score, and why?"
+
+### Phase B: Metric-Specific Triage
+
+Status: complete for local vendored artifacts.
+
+Classify failures by the metric they damage:
+
+| Metric | Failure buckets |
+| --- | --- |
+| NID | bad reading order, broken paragraph join, duplicated text, missing text, header/footer noise, soft hyphen artifacts |
+| TEDS | table missed, row split wrong, column split wrong, rowspan/colspan missing, HTML/GFM rendering mismatch, table continuation missed |
+| MHS | title missed, heading level wrong, heading text noisy, heading/content association wrong, false heading promotion |
+| Speed/resource | slow page, timeout, worker startup cost, OCR/model route invoked incorrectly |
+| Replay | quote not anchorable, bbox missing, parse trace span missing, source hash mismatch |
+
+Done when every low-score case has a stable bucket and a reproducible fixture.
+
+### Phase C: Reading Order and Text Normalization
+
+Status: partial. Pass2 added page-number filtering and false table suppression,
+but NID is still `0.739`, below the previous optimized-timeout `0.766` and far
+below the OpenDataLoader/Docling reference range.
+
+Target the OpenDataLoader base NID range first:
+
+- prefer trustworthy tagged-PDF structure trees
+- strengthen XY-Cut++ only where structure is absent or suspect
+- suppress page numbers, repeated headers/footers, duplicate/background text
+- dehyphenate line wraps
+- join paragraph lines without flattening lists/tables
+- preserve quote anchors through `parse_trace`
+
+Short-term target:
+
+```text
+NID >= 0.84
+NID-S >= 0.86
+```
+
+Mid-term target:
+
+```text
+NID >= 0.89
+NID-S >= 0.89
+```
+
+### Phase D: Table Cluster Port
+
+Status: partial. Pass2 fixed row/column range export and added guarded spatial
+table fallback, lifting TEDS to `0.188`, but real Rust-core table clustering and
+complex table structure remain pending.
+
+Port OpenDataLoader-style `table_method=cluster` behavior into Rust-owned
+DocTruth logic with attribution and tests:
+
+- table presence detection
+- bordered-grid detection
+- whitespace/text-spatial clustering for borderless tables
+- row and column boundary inference
+- merged-cell inference
+- table caption association
+- continuation/adjacent-page table handling
+- deterministic HTML table rendering for bench compatibility
+- TrustTable/TrustUnit evidence and bbox preservation
+
+Short-term target:
+
+```text
+TEDS >= 0.25
+TEDS-S >= 0.30
+```
+
+Mid-term target:
+
+```text
+TEDS >= 0.45
+TEDS-S >= 0.50
+```
+
+Hybrid target:
+
+```text
+TEDS >= 0.80
+```
+
+### Phase E: Heading and Section Tree
+
+Status: partial. Pass2 export-layer heading promotion reduced missing-heading
+failures, but MHS is still `0.196` and heading hierarchy mismatch is the largest
+remaining failure bucket.
+
+Build a real section model instead of export-only heading promotion:
+
+- title detection from font size/weight/position
+- heading detection from font/style/numbering/spacing
+- heading level assignment
+- heading/content grouping
+- false-heading suppression for table cells, headers, sidebars, and captions
+- Markdown heading rendering from the section tree
+- `content_blocks.json` and `parse_trace.json` section linkage
+
+Short-term target:
+
+```text
+MHS >= 0.45
+MHS-S >= 0.55
+```
+
+Mid-term target:
+
+```text
+MHS >= 0.70
+MHS-S >= 0.80
+```
+
+### Phase F: OCR and Model Routing
+
+Benchmark scanned/no-text cases through the existing Rust-owned worker route:
+
+- detect no-text or low-text pages
+- route OCR through configured local worker
+- preserve OCR bbox/confidence in TrustDocument
+- block audit-grade when OCR confidence is low
+- keep model workers optional and local-first
+
+This phase should not make OCR mandatory for normal text-layer PDFs.
+
+### Phase G: Optional Hybrid Advisor
+
+Use Docling/OpenDataLoader hybrid as a dev/test advisor:
+
+- compare DocTruth parse trace to Docling/OpenDataLoader output
+- record disagreements as warnings or triage labels
+- use disagreement cases to add Rust tests
+- do not make Docling output canonical
+- do not add heavy hybrid runtime as default OSS path
+
+Hybrid can be an enterprise/high-accuracy mode later, but the OSS default must
+remain local, Rust-owned, and dependency-conscious.
+
+## TDD Shape
+
+For each metric slice:
+
+1. Pick the worst 5-20 real PDFs from the bench report.
+2. Add minimal Rust fixtures or copied public bench cases where license allows.
+3. Write RED tests at the Rust runtime boundary.
+4. Implement the parser behavior in `runtime/doctruth-runtime`.
+5. Run focused tests.
+6. Run a partial OpenDataLoader Bench subset.
+7. Run the full 200-PDF bench before claiming score movement.
+8. Record exact metrics and changed case IDs.
+
+## Acceptance Targets
+
+### Near-Term
+
+```text
+overall >= 0.65
+NID >= 0.84
+TEDS >= 0.25
+MHS >= 0.45
+full bench completes with bounded timeouts
+```
+
+### OpenDataLoader Base Parity
+
+```text
+overall >= 0.80
+NID >= 0.89
+TEDS >= 0.45
+MHS >= 0.70
+```
+
+### High-Accuracy Reference Range
+
+```text
+overall >= 0.88
+NID >= 0.90
+TEDS >= 0.85
+MHS >= 0.80
+```
+
+Reaching the high-accuracy range probably requires a hybrid/model-assisted path,
+not only deterministic text-layer heuristics.
+
+## Immediate Next Work
+
+1. Extend the new Rust `parseTrace.pages[].textSpans[]` observation layer into
+ real XY-Cut++ diagnostics and per-page debug span artifacts, so reading-order
+ fixes can be tested before Markdown export.
+2. Move table-cluster behavior from export-layer fallback into
+ `runtime/doctruth-runtime`, with Rust fixtures for bordered, borderless,
+   merged-cell, continuation, and OpenDataLoader-style `table_method=cluster`
+ cases.
+3. Calibrate the Rust-owned section tree against real
+ `heading_hierarchy_mismatch` failures: centered titles, sidebar labels,
+ title/subtitle stacks, and false title-case body lines. The section metadata
+ contract now exists; the remaining work is benchmark-grade inference.
+4. Restore and lift NID with paragraph joining, dehyphenation, header/footer
+ suppression, and safer multi-column ordering.
+5. Run the OCR/model-worker path against no-text/scanned benchmark cases so
+ zero-score OCR pages are not silently treated as text-layer failures.
+6. Keep generated prediction artifacts ignored unless a small fixture is
+ intentionally checked in for a RED test.
diff --git a/docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md b/docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md
new file mode 100644
index 00000000..5e1063ee
--- /dev/null
+++ b/docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md
@@ -0,0 +1,631 @@
+# OpenDataLoader Hybrid Rustification TDD Plan
+
+Date: 2026-06-18
+
+Status: superseded for execution by
+`docs/plans/2026-06-23-java-core-rust-shell-opendataloader-parity.md`
+
+Owner: DocTruth
+
+## Goal
+
+Make DocTruth practical for edge and local-agent use by turning the proven
+OpenDataLoader hybrid quality path into a DocTruth-owned runtime path, then
+progressively replacing the Python/Torch-heavy pieces with Rust and MNN-first
+lazy model runtime.
+
+Correction: this plan's practical intent was to preserve OpenDataLoader-quality
+parsing while Rustifying the expensive Python/Docling/Torch outer runtime. It
+must not be read as "replace the Java/PDFBox/OpenDataLoader-compatible parser
+quality core with a from-scratch Rust parser before benchmark parity." Current
+execution keeps the Java/OpenDataLoader-compatible parser core as the quality
+source of truth and makes Rust own the runtime shell, MNN worker boundary,
+benchmark runner, resource accounting, and Python replacement path.
+
+This plan supersedes the idea that DocTruth v1 should first become a fully
+from-scratch Rust parser. The more practical route is:
+
+```text
+OpenDataLoader hybrid quality baseline first
+-> DocTruth TrustDocument adapter
+-> Rust deterministic local parser parity
+-> MNN-first lazy model runtime
+-> OpenDataLoader/Docling/Python/Torch as benchmark oracle only
+```
+
+The target is not to make OpenDataLoader, Docling, or MinerU schemas canonical.
+`TrustDocument` remains canonical. External parser output is input evidence and
+quality reference only.
+
+## Current Measured Baseline
+
+The live OpenDataLoader hybrid benchmark was run locally against the vendored
+OpenDataLoader Bench corpus.
+
+```text
+engine: opendataloader-hybrid 2.2.1
+corpus: 200 PDFs
+quality:
+ overall: 0.9065718466674022
+ NID: 0.9337307553293448
+ TEDS: 0.9276430534097512
+ MHS: 0.8207761855598542
+speed:
+ parser total: 125.29678010940552s
+ parser avg: 0.6264839005470276s/doc
+ command wall: 130.33s
+resources:
+ docling-fast hybrid server RSS: about 1.39GB to 1.51GB
+ client/JAR full-run peak RSS: about 408MB
+ warm client single-run peak RSS: about 140MB
+```
+
+Interpretation:
+
+```text
+OpenDataLoader hybrid quality works.
+DocTruth parser quality does not yet match it.
+The current memory problem is mostly Docling/Torch/model runtime, not Java alone.
+```
+
+## Architecture Direction
+
+### Runtime Tiers
+
+DocTruth should expose three parser tiers under one TrustDocument contract.
+
+```text
+Tier 0: Rust local deterministic
+ PDF substrate, spans, bbox, XY-Cut++, safety filters, table geometry,
+ heading/list/section inference.
+ Default for local/edge use.
+
+Tier 1: Rust + MNN lazy model runtime
+ Layout/table/OCR models loaded on demand.
+ ONNX is allowed only as a conversion interchange artifact.
+ MNN is the production local model format.
+ Target for high-quality local use without Python/Torch server residency.
+
+Tier 2: OpenDataLoader hybrid benchmark oracle
+ opendataloader-pdf + docling-fast/Torch.
+ Highest current quality reference.
+ Not a production fallback path.
+ Used for benchmark reproduction, migration comparison, and quality triage.
+```
+
+### Reference Composition
+
+```text
+OpenDataLoader Bench = objective parser-quality gate
+OpenDataLoader PDF = high-quality hybrid baseline and behavior reference
+Docling = layout/table model quality reference
+Kreuzberg = Rust runtime/model cache/worker architecture reference
+MinerU = layered output product reference
+DocTruth = TrustDocument, sourceRefs, parseTrace, audit, replay
+```
+
+No external schema becomes canonical. All outputs normalize into:
+
+```text
+TrustDocument
+contentBlocks
+parseTrace
+sourceRefs
+audit JSON
+replay artifacts
+benchmark reports
+```
+
+## TDD Rules For This Work
+
+Every implementation slice must follow red-green-refactor.
+
+Required evidence per slice:
+
+```text
+1. RED test added first
+2. RED failure captured in progress.md
+3. minimal implementation
+4. GREEN test output captured in progress.md
+5. benchmark or smoke delta recorded when applicable
+6. no production behavior marked complete without a failing test first
+```
+
+Do not claim quality improvement from code review or screenshots. Quality claims
+must come from:
+
+```text
+OpenDataLoader Bench metrics
+DocTruth benchmark-corpus reports
+per-case regression fixtures
+resource measurements
+```
+
+## Phase 1: Live Hybrid Benchmark Oracle Adapter
+
+Goal: make the current quality baseline reproducible from DocTruth benchmark
+tooling without turning OpenDataLoader hybrid into a production parser backend
+or runtime fallback.
+
+Scope:
+
+```text
+- Add a DocTruth benchmark oracle adapter named opendataloader-hybrid.
+- Start/reuse a local hybrid server only from benchmark/oracle commands.
+- Call opendataloader-pdf hybrid conversion.
+- Capture produced Markdown.
+- Record backend provenance:
+ - opendataloader-pdf version
+ - docling version
+ - hybrid mode
+ - server URL
+ - runtime RSS if measurable
+ - elapsed time
+- Normalize output into TrustDocument.
+- Mark evidence grade honestly:
+ - Markdown-only mapping is not span-perfect.
+ - sourceRefs are coarse until structured/bbox adapter lands.
+- Do not expose this as an automatic runtime fallback for production parsing.
+```
+
+TDD tests:
+
+```text
+RED: benchmark oracle command rejects opendataloader-hybrid when dependency is missing with a clear doctor hint.
+RED: benchmark oracle command accepts opendataloader-hybrid and emits TrustDocument with parserRun.backend.
+RED: parserRun records externalBackend provenance and elapsedMs.
+RED: audit status is NOT_AUDIT_GRADE when only Markdown-level source mapping exists.
+RED: benchmark adapter can run one vendored OpenDataLoader PDF through the backend.
+RED: production parse profiles cannot auto-select opendataloader-hybrid.
+```
+
+Done when:
+
+```text
+doctruth benchmark-oracle --engine opendataloader-hybrid --json
+```
+
+produces a valid TrustDocument and a recorded one-document benchmark smoke.
+
+## Phase 2: OpenDataLoader Structured Output Adapter
+
+Goal: stop treating Markdown as the only output and extract the richest
+available OpenDataLoader object structure before rendering.
+
+Scope:
+
+```text
+- Investigate opendataloader-pdf public API for structured objects.
+- Prefer object/block/table/list/heading output over Markdown parsing.
+- Map OpenDataLoader object types into TrustDocument units.
+- Preserve table cells, heading levels, lists, reading order, and coarse bbox
+ if available.
+- Keep Markdown as a lossy export, not the source of truth.
+```
+
+TDD tests:
+
+```text
+RED: known table PDF maps to TrustDocument TABLE with expected row/column counts.
+RED: known heading PDF maps to contentBlocks heading levels without Markdown inference.
+RED: list PDF preserves list items as list blocks.
+RED: adapter emits source mapping quality = structured when block ids are available.
+RED: adapter falls back to Markdown only with explicit warning when structured API is unavailable.
+```
+
+Done when:
+
+```text
+OpenDataLoader object/block output -> TrustDocument
+```
+
+is the default for this backend, with Markdown as a secondary export.
+
+## Phase 3: Rust Deterministic Parity For Non-Model Work
+
+Goal: move the deterministic parts that do not require Docling/Torch into Rust,
+using OpenDataLoader behavior as the reference.
+
+Scope:
+
+```text
+- PDF substrate and glyph/span extraction through Rust.
+- safety filters:
+ - whitespace
+ - off-page
+ - tiny text
+ - duplicate text
+ - invisible render mode
+ - near-white/background-like text
+ - hidden OCG when substrate exposes enough data
+- tagged-PDF structure-tree preference.
+- XY-Cut++ reading order.
+- table geometry:
+ - bordered tables
+ - cluster/borderless tables
+ - sparse rows
+ - empty-cell preservation
+ - continued table detection
+- heading/list/section tree.
+```
+
+TDD tests:
+
+```text
+RED: per-case OpenDataLoader Bench failures become Rust fixtures.
+RED: each fixture asserts TrustDocument, not Markdown-only output.
+RED: fixture tags cover reading-order, table, heading, safety-filter, source-map.
+RED: benchmark report rejects claiming parity without external NID/TEDS/MHS gates.
+```
+
+Done when:
+
+```text
+Rust local deterministic backend beats current DocTruth pass2 scores materially
+and closes a documented subset of the gaps against OpenDataLoader hybrid without model use.
+```
+
+The target for this phase is not full hybrid parity. It is to avoid model
+startup for ordinary text-layer PDFs.
+
+## Phase 4: MNN-First Model Runtime Boundary
+
+Goal: replace always-on Python/Torch/Docling server residency with a single
+production model path: Rust orchestrates lazy local MNN model execution.
+
+Scope:
+
+```text
+- Define model manifest contract for layout/table/OCR models.
+- Use ONNX only as an intermediate conversion artifact.
+- Convert ONNX artifacts to MNN before production packaging.
+- Ship MNN artifacts for local runtime.
+- Support FP32 MNN by default.
+- Allow MNN weight-only 8-bit models only after benchmark delta is proven.
+- Add lazy MNN model loading and unload policy.
+- Add page-level routing:
+ - simple text page -> Rust deterministic only
+ - complex layout/table page -> MNN layout/table model
+ - scanned/OCR page -> MNN OCR model
+- Record model provenance and resource metrics in parserRun.
+- Fail closed when a required MNN model is unavailable.
+- Do not silently fall back to ONNX Runtime, Torch, Docling, Tesseract, PDFBox,
+ or another parser backend.
+```
+
+Candidate model families:
+
+```text
+layout:
+ RT-DETR/DocLayNet-style layout detector
+ Docling layout model only if it can be converted into the MNN runtime path
+
+table:
+ TATR / Table Transformer
+ SLANeXT / SLANet-style table recognizer where licensing and runtime permit
+
+OCR:
+ RapidOCR/MNN
+ MNN-compatible OCR models with pinned manifest and corpus validation
+```
+
+TDD tests:
+
+```text
+RED: model manifest SHA mismatch blocks model use.
+RED: missing required MNN model fails the requested model feature or marks output not audit-grade; it does not invoke another runtime.
+RED: simple PDF does not start MNN runtime.
+RED: table-heavy PDF routes only relevant pages to table model.
+RED: scanned PDF routes to MNN OCR model.
+RED: ONNX artifact is accepted only by the conversion toolchain, not by production parse runtime.
+RED: Torch/Docling/OpenDataLoader hybrid cannot be selected as automatic runtime fallback.
+RED: resource report includes model cold-start, inference time, and peak RSS when measurable.
+```
+
+Done when:
+
+```text
+DocTruth can parse a mixed corpus with lazy MNN model startup and lower steady
+RSS than docling-fast/Torch while keeping documented quality on routed cases.
+```
+
+## Phase 5: Resource Gate And Edge Profile
+
+Goal: make edge/local-agent use measurable and enforceable.
+
+Profiles:
+
+```text
+edge-fast:
+ Rust deterministic only.
+ No network.
+ No model server.
+ Target RSS: low tens to low hundreds of MB.
+
+edge-model:
+ Rust deterministic + lazy MNN runtime.
+ No Torch.
+ Model cache verified.
+ Target RSS: measured per model manifest and platform, materially below the
+ docling-fast/Torch oracle, and released toward the profile idle budget after
+ unload. No universal absolute RSS gate before the real MNN profile report.
+
+benchmark-oracle:
+ OpenDataLoader hybrid/docling-fast.
+ Highest current quality reference.
+ Explicit benchmark/comparison mode only.
+ Not a production parse fallback.
+```
+
+TDD tests:
+
+```text
+RED: doctor reports active profile and unavailable capabilities.
+RED: edge-fast profile rejects model startup.
+RED: edge-model loads MNN models lazily.
+RED: benchmark-oracle refuses to run unless explicitly requested.
+RED: production profiles reject automatic runtime fallback chains.
+RED: parser benchmark report includes RSS/cold-start/warm-run metrics.
+```
+
+Done when:
+
+```text
+doctruth doctor
+doctruth parse --profile edge-fast
+doctruth parse --profile edge-model
+doctruth parse --profile benchmark-oracle
+```
+
+have explicit, tested behavior and resource reports.
+
+## Phase 6: Benchmark Gates And Promotion Criteria
+
+Goal: prevent parser-quality claims from drifting back into subjective language.
+
+Required benchmark lanes:
+
+```text
+1. DocTruth seed corpus
+2. OpenDataLoader Bench one-doc smoke
+3. OpenDataLoader Bench subset by fixture type
+4. OpenDataLoader Bench full 200 PDFs
+5. replay-validity benchmark
+6. resource benchmark
+```
+
+Promotion gates:
+
+```text
+OpenDataLoader hybrid benchmark oracle:
+ must reproduce published/local hybrid baseline metrics within tolerance.
+ must not be promoted as production runtime fallback.
+
+Rust deterministic:
+ must improve over current DocTruth runtime baseline and report known gaps.
+
+Rust + MNN:
+ must prove lower steady RSS than docling-fast and pass routed-case quality gates.
+ ONNX artifacts are not production runtime artifacts.
+ must run OpenDataLoader Bench because converted MNN models may degrade quality.
+ quality may be slightly lower than OpenDataLoader hybrid oracle, but not
+ materially worse.
+ performance and resource use must be materially better than docling-fast/Torch.
+
+Audit-grade:
+ requires TrustDocument sourceRefs, quote replayability, evidence-span
+ replayability, source hashes, parser warnings, and benchmark report binding.
+```
+
+Done when:
+
+```text
+No parser backend can be promoted to audit-grade only because its Markdown looks good.
+```
+
+### Final MNN Acceptance Gate
+
+The MNN production runtime is accepted only when it passes a full measured
+quality and resource gate against the same OpenDataLoader Bench corpus used for
+the OpenDataLoader hybrid oracle.
+
+Required run:
+
+```text
+DocTruth MNN runtime -> TrustDocument -> OpenDataLoader Bench prediction format
+OpenDataLoader Bench evaluator -> NID/TEDS/MHS/overall
+DocTruth resource benchmark -> cold start, warm latency, steady RSS, peak RSS
+```
+
+Reference baseline:
+
+```text
+OpenDataLoader hybrid oracle:
+ overall: 0.9065718466674022
+ NID: 0.9337307553293448
+ TEDS: 0.9276430534097512
+ MHS: 0.8207761855598542
+ RSS: about 1.39GB to 1.51GB for docling-fast server
+ speed: about 0.626s/doc on the measured full run
+```
+
+Initial acceptance target:
+
+```text
+Quality:
+ overall >= 0.88
+ NID >= 0.91
+ TEDS >= 0.88
+ MHS >= 0.78
+
+Resource/performance:
+ no Python/Torch/Docling process in production parse runtime
+ steady RSS must be materially lower than the measured docling-fast/Torch oracle
+ cold start must be materially lower than docling-fast server startup
+ warm per-doc latency should be competitive with OpenDataLoader hybrid
+ absolute RSS values are measured budgets first, not universal product gates
+  no implementation is accepted or rejected solely because it meets or misses
+  an arbitrary RSS number before a named profile report exists
+```
+
+The quality thresholds are explicit gates because they describe user-visible
+parser quality. The resource thresholds are deliberately profile-based because
+one memory number cannot honestly cover every model, page crop policy, allocator,
+and machine. Resource gates are split into three levels:
+
+```text
+Level 1 hard gate:
+ production parse runtime must not keep Python/Torch/Docling resident.
+
+Level 2 comparative gate:
+ Rust + MNN must be materially lighter than the measured docling-fast/Torch
+ oracle on the same corpus and machine.
+
+Level 3 profile regression gate:
+ after a specific model manifest/platform/corpus has a measured report, future
+ releases for that profile must not materially regress without a new report and
+ rationale.
+```
+
+This matters because model size, precision mode, platform allocator behavior,
+crop buffers, batching, and unload policy can change the absolute RSS profile.
+The production resource hard gates are:
+
+```text
+- no Python/Torch/Docling process in production parse runtime
+- steady RSS must be materially lower than the measured docling-fast/Torch oracle
+- memory must return toward the configured idle budget after model unload
+- each accepted model profile must publish cold-load RSS, warm steady RSS, peak
+ RSS, idle-after-unload RSS, cold latency, warm latency, and corpus scope
+```
+
+Do not hard-code a universal absolute RSS threshold such as
+`steady RSS <= 600MB`. That would make the plan look precise while hiding the
+variables that actually decide memory use.
+
+Absolute RSS numbers are profiling budgets first. They become regression guards
+only after a full benchmark report records the actual model set, precision
+mode, platform, corpus scope, crop buffers, warm/idle behavior, unload policy,
+and repeated-run variance. After that report exists, convert the observed
+budget into a named profile guard with platform and model manifest names
+attached. The guard protects against silent regression for that profile; it is
+not a product-wide promise for every model or every machine.
+
+Initial profiling budgets should be recorded per profile:
+
+```text
+edge-fast:
+ expected to stay in low tens to low hundreds of MB because it does not load
+ model runtimes.
+
+edge-model:
+ expected to remain far below the docling-fast/Torch oracle in steady state.
+ record cold-load RSS, warm steady RSS, peak RSS, and idle-after-unload RSS.
+ the first absolute target is set only after the first full MNN benchmark run.
+ express it as a regression guard for that measured profile instead of a
+ universal product promise.
+
+edge-high-accuracy:
+ allowed to use larger MNN model manifests when quality requires them.
+ must still avoid Python/Torch/Docling residency and publish the same resource
+ breakdown. It is compared against the heavy oracle and against the previous
+ accepted high-accuracy profile, not against the edge-fast budget.
+
+Example:
+ if a specific Mac ARM64 edge-model profile with a pinned model manifest
+ measures 451MB warm steady RSS, that number is recorded as the baseline for
+ that exact profile. The guard should then be derived from repeated-run
+ variance and release risk, for example "do not materially regress from the
+ recorded Mac ARM64 edge-model baseline without an updated benchmark report,"
+ rather than "all edge-model builds must stay below 600MB".
+```
+
+This means `451MB` is evidence, not policy. A future MNN OCR model, table model,
+larger crop buffer, or Windows allocator may have a different absolute budget.
+The acceptance target is therefore not a fixed cap such as `steady RSS <= 600MB`
+derived from one `451MB` measurement; it is near-hybrid quality, no
+Python/Torch/Docling production residency, lazy MNN loading, measurable unload
+behavior, and no unexplained regression against a named profile baseline.
+
+Practical interpretation: before the first MNN profile report, compare against
+the measured heavy oracle and record the full resource breakdown. After the
+first report, use that named profile as the baseline for future regression
+checks. Do not turn a provisional measurement into a product-wide limit.
+The product-level policy is:
+
+```text
+1. production runtime has no Python/Torch/Docling process
+2. edge-model is lazy-loaded
+3. idle unload is measurable
+4. each model profile publishes its own budget
+5. profile releases cannot materially regress without a new benchmark report
+6. quality gates still apply to the same benchmark corpus
+```
+
+Any resource threshold change must be committed with:
+
+```text
+- full benchmark report
+- per-case regression report
+- resource report
+- model-by-model RSS and latency breakdown when measurable
+- explanation of whether loss comes from conversion, quantization, routing, model choice, or runtime buffers
+- updated target and rationale
+```
+
+Done when:
+
+```text
+The MNN runtime proves near-hybrid quality with substantially lower resource
+use, or the report clearly identifies the model/conversion gap blocking
+promotion.
+```
+
+## Expected Outcome
+
+This route gives DocTruth a practical product path:
+
+```text
+Short term:
+ Use OpenDataLoader hybrid as an explicit heavy benchmark oracle.
+
+Medium term:
+ Move the deterministic parser brain into Rust and avoid models for ordinary PDFs.
+
+Long term:
+ Replace Python/Torch residency with lazy MNN model runtime where model quality
+ is necessary.
+```
+
+The key product claim becomes:
+
+```text
+DocTruth can choose the cheapest parser path that preserves replayable evidence.
+```
+
+not:
+
+```text
+DocTruth rewrote every document parser from scratch in Rust before it works.
+```
+
+## Immediate Next TDD Slice
+
+Start with Phase 1.
+
+First RED tests:
+
+```text
+1. Benchmark-oracle command exposes `--engine opendataloader-hybrid` and fails
+ clearly when the dependency is missing.
+2. A fake OpenDataLoader hybrid oracle runner returns Markdown and provenance;
+ DocTruth maps it into TrustDocument with
+ `parserRun.backend=opendataloader-hybrid-oracle`.
+3. Markdown-only source mapping marks output `NOT_AUDIT_GRADE` with a clear
+ warning.
+4. Production parse profiles cannot auto-select OpenDataLoader hybrid.
+5. The one-document OpenDataLoader Bench smoke can use this oracle adapter and
+ write a benchmark report.
+```
+
+Only after those tests fail for the right reason should implementation begin.
diff --git a/docs/plans/2026-06-23-java-core-rust-shell-opendataloader-parity.md b/docs/plans/2026-06-23-java-core-rust-shell-opendataloader-parity.md
new file mode 100644
index 00000000..1d672649
--- /dev/null
+++ b/docs/plans/2026-06-23-java-core-rust-shell-opendataloader-parity.md
@@ -0,0 +1,786 @@
+# Java Core Rust Shell OpenDataLoader Parity Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Reach OpenDataLoader benchmark parity by keeping the proven Java/PDFBox/OpenDataLoader-style parser quality path as the document parsing core, while replacing Python/Docling/Torch runtime shells with Rust-owned orchestration, model workers, benchmark execution, and TrustDocument normalization.
+
+**Architecture:** Java owns the canonical document parser backend for PDF text extraction, layout geometry, table heuristics, headings, reading order, veraPDF/PDFBox compatibility, and TrustDocument emission. Rust owns the long-running local runtime shell: process lifecycle, corpus runner, resource accounting, MNN model worker, JSONL protocol, OpenDataLoader Bench prediction generation, and Python-free default execution. Python/OpenDataLoader original runners remain oracle-only fixtures, never production fallback.
+
+**Tech Stack:** Java 25/Maven, Apache PDFBox 3, existing DocTruth TrustDocument model, OpenDataLoader PDF reference under `third_party/`, Rust/Cargo, serde/serde_json, stdio JSONL, MNN model worker boundary, OpenDataLoader Bench corpus/evaluator.
+
+---
+
+## Why This Replaces The Previous Execution Direction
+
+The previous plan had the right practical insight but the wrong enforcement:
+
+```text
+OpenDataLoader hybrid quality baseline first
+-> DocTruth TrustDocument adapter
+-> Rust deterministic local parser parity
+-> MNN-first lazy model runtime
+-> OpenDataLoader/Docling/Python/Torch as benchmark oracle only
+```
+
+What went wrong:
+
+- Repo policy and several docs over-rotated to "Rust parser core replaces Java/PDFBox."
+- Implementation then chased Rust parser heuristics directly instead of first preserving the Java/OpenDataLoader quality path.
+- The Rust parity matrix mostly records partial processor behavior, not full OpenDataLoader algorithm parity.
+- Current full200 score proves the gap: `overall=0.745414`, with the largest misses in reading order, heading hierarchy, and table structure.
+
+Corrected direction:
+
+```text
+Java/OpenDataLoader-compatible parser core = quality source of truth
+Rust runtime shell = Python/Torch/Docling replacement and edge runtime
+TrustDocument = canonical DocTruth schema
+OpenDataLoader original = benchmark oracle only
+```
+
+This is not a brand-new product strategy. It is a corrective execution plan for the already intended practical path: preserve parser accuracy first, then Rustify the expensive outer runtime.
+
+## Non-Negotiable Boundaries
+
+- Do not replace the Java parser core until benchmark parity is achieved and a separate Rust core ADR is approved.
+- Do not add Python as a production fallback.
+- Do not run one Java process per PDF in benchmark mode; the Java backend must stay warm across the corpus.
+- Do not claim OpenDataLoader parity from fixture-only tests.
+- Do not make external schemas canonical. Normalize everything through TrustDocument.
+- Do not hide quality loss behind resource wins. Benchmark quality and runtime metrics must be reported together.
+
+## Current Evidence Baseline
+
+Use this as the current regression target:
+
+```text
+branch: feat/opendataloader-parity-coverage
+run: third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-current-20260623-180244/
+parsed: 199/200
+elapsed: 221.6s
+mean: 1.11s/doc
+overall: 0.745414
+nid: 0.860092
+teds: 0.496416
+mhs: 0.483837
+```
+
+Largest gap buckets from the current triage:
+
+```text
+reading_order_or_text_normalization: 89
+heading_hierarchy_mismatch: 76
+heading_missing: 7
+table_structure_mismatch: 16
+table_missing: 8
+text_noise_or_duplicates: 2
+text_missing_or_truncated: 2
+```
+
+## Phase 1: Fix Product/Architecture Contracts
+
+### Task 1.1: Rewrite parser ownership docs
+
+Files:
+
+- `AGENTS.md`
+- `docs/pdf-parser-runtime-prd.md`
+- `docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md`
+- `docs/plans/python-to-rust-parser-parity.md`
+- `docs/parser/opendataloader-parity-matrix.md`
+
+Change:
+
+- State that Java/PDFBox/OpenDataLoader-compatible parsing is the current default quality core.
+- State that Rust owns runtime shell, worker lifecycle, model runtime, corpus runner, resource accounting, and optional future parser modules.
+- State that Python/OpenDataLoader original is oracle-only.
+- State that "Rust parser core" is a future ADR, not current default.
+
+Tests:
+
+- Update `src/test/java/ai/doctruth/ArchitectureContractTest.java` to assert these exact policy lines exist:
+ - `Java/OpenDataLoader-compatible parser core is the current quality source of truth`
+ - `Rust owns the runtime shell and Python replacement boundary`
+ - `Python/OpenDataLoader original runners are oracle-only`
+
+Verification:
+
+```bash
+mvn -q -Dtest=ArchitectureContractTest test
+git diff --check
+```
+
+Commit:
+
+```bash
+git add AGENTS.md docs/pdf-parser-runtime-prd.md docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md docs/plans/python-to-rust-parser-parity.md docs/parser/opendataloader-parity-matrix.md src/test/java/ai/doctruth/ArchitectureContractTest.java
+git commit -m "docs: correct opendataloader parser ownership boundary"
+```
+
+## Phase 2: Promote Java OpenDataLoader Backend From Oracle To First-Class Local Backend
+
+### Task 2.1: Add backend contract tests before implementation
+
+Files:
+
+- `src/test/java/ai/doctruth/opendataloader/OpenDataLoaderJavaBackendContractTest.java`
+- `src/test/java/ai/doctruth/opendataloader/OpenDataLoaderBackendProtocolTest.java`
+
+Test cases:
+
+- A sample PDF produces a backend response with:
+ - `backend = "opendataloader-java-core"`
+ - `schemaVersion`
+ - `markdown`
+ - `blocks[]`
+ - `tables[]`
+ - `headings[]`
+ - `sourceMap[]`
+ - `warnings[]`
+ - `metrics`
+- Structured blocks include `id`, `kind`, `pageIndex`, `bbox`, `readingOrder`, `text`.
+- Tables include cell-level row/column coordinates when available.
+- Response can be converted to `TrustDocument` without losing source refs.
+
+Verification should fail before implementation:
+
+```bash
+mvn -q -Dtest=OpenDataLoaderJavaBackendContractTest,OpenDataLoaderBackendProtocolTest test
+```
+
+### Task 2.2: Implement Java backend DTOs and parser facade
+
+Files:
+
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderBackendRequest.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderBackendResponse.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderBlock.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderTable.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderTableCell.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderSourceRef.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderJavaBackend.java`
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderTrustDocumentAdapter.java`
+
+Implementation:
+
+- Reuse existing `PdfDocumentParser`, `PdfPageBlockExtractor`, `PdfPageTableExtractor`, `PdfBorderlessTableExtractor`, `PdfSemanticSectionCoalescer`, and `TrustDocumentParser`.
+- Do not duplicate parser algorithms in Rust for this phase.
+- Expose the parser output as OpenDataLoader-shaped structured blocks, then normalize into TrustDocument.
+- Keep warning codes explicit for unsupported exact parity features.
+
+Verification:
+
+```bash
+mvn -q -Dtest=OpenDataLoaderJavaBackendContractTest,OpenDataLoaderBackendProtocolTest test
+mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,PdfBorderlessTableExtractionTest test
+```
+
+Commit:
+
+```bash
+git add src/main/java/ai/doctruth/opendataloader src/test/java/ai/doctruth/opendataloader
+git commit -m "feat: add opendataloader java parser backend"
+```
+
+## Phase 3: Add Warm Java Backend Process For Rust Runtime
+
+### Task 3.1: Add Java JSONL backend CLI
+
+Files:
+
+- `src/main/java/ai/doctruth/opendataloader/OpenDataLoaderBackendCli.java`
+- `src/test/java/ai/doctruth/opendataloader/OpenDataLoaderBackendCliTest.java`
+- `src/main/java/ai/doctruth/cli/DocTruthCli.java`
+- `src/main/java/ai/doctruth/cli/Usage.java`
+
+Behavior:
+
+- Add hidden/developer command:
+
+```bash
+doctruth opendataloader-backend --stdio-jsonl
+```
+
+- Read one JSON request per line from stdin.
+- Write one JSON response per line to stdout.
+- Keep JVM process alive across documents.
+- Return structured error JSON instead of crashing on one bad PDF.
+- Include per-document parse timings and peak Java process metadata when available.
+
+Tests:
+
+- CLI parses two requests through one process.
+- Malformed request returns structured error and process stays alive.
+- Unsupported options are rejected fail-closed.
+
+Verification:
+
+```bash
+mvn -q -Dtest=OpenDataLoaderBackendCliTest test
+```
+
+### Task 3.2: Add Rust warm-process client
+
+Files:
+
+- `runtime/doctruth-runtime/src/opendataloader_java_backend.rs`
+- `runtime/doctruth-runtime/src/lib.rs`
+- `runtime/doctruth-runtime/src/main.rs`
+- `runtime/doctruth-runtime/tests/opendataloader_java_backend_contract.rs`
+
+Behavior:
+
+- Spawn the Java backend once for a benchmark run.
+- Send JSONL requests and parse JSONL responses.
+- Track startup time separately from per-document parse time.
+- Kill the child process at the end of the run.
+- Fail closed if the Java backend exits or emits invalid JSON.
+
+Tests:
+
+- A fake JSONL worker proves Rust sends multiple documents to one process.
+- A fake worker with bad JSON returns a structured error.
+- A fake worker with one failed PDF continues to parse the next request.
+
+Verification:
+
+```bash
+cd runtime/doctruth-runtime && cargo test --test opendataloader_java_backend_contract
+```
+
+Commit:
+
+```bash
+git add src/main/java/ai/doctruth/opendataloader src/test/java/ai/doctruth/opendataloader runtime/doctruth-runtime/src runtime/doctruth-runtime/tests
+git commit -m "feat: add warm java opendataloader backend bridge"
+```
+
+## Phase 4: Route OpenDataLoader Bench Through The Java Quality Core
+
+### Task 4.1: Add backend mode to Rust benchmark prediction generator
+
+Files:
+
+- `runtime/doctruth-runtime/src/main.rs`
+- `runtime/doctruth-runtime/src/lib.rs`
+- `runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs`
+- `scripts/run-doctruth-opendataloader-bench.sh`
+
+Behavior:
+
+- Add explicit backend mode:
+
+```bash
+doctruth-runtime opendataloader-prediction \
+ --backend opendataloader-java-core \
+ --manifest third_party/opendataloader-bench/... \
+ --out third_party/opendataloader-bench/prediction/doctruth-java-core-...
+```
+
+- Default benchmark backend should be `opendataloader-java-core`.
+- Existing Rust heuristic backend remains available as `rust-edge-fast`, but not called parity.
+- Prediction output must include:
+ - `backend`
+ - `javaBackendCommand`
+ - `rustRuntimeVersion`
+ - `parserPolicy`
+ - `startupMs`
+ - `perDocumentMs`
+ - `rssSamples`
+ - source hashes
+
+Tests:
+
+- The runner writes OpenDataLoader-compatible Markdown artifacts.
+- The runner records backend metadata.
+- The runner does not invoke Python unless `--oracle-python` is explicitly passed.
+
+Verification:
+
+```bash
+cd runtime/doctruth-runtime && cargo test --test benchmark_corpus_contract
+```
+
+### Task 4.2: Add no-Python default guard
+
+Files:
+
+- `runtime/doctruth-runtime/tests/opendataloader_python_boundary_contract.rs`
+- `scripts/check-no-python-defaults.sh`
+
+Behavior:
+
+- In production/default benchmark mode, these strings must not appear in execution path config:
+ - `python`
+ - `docling`
+ - `torch`
+ - `opendataloader-hybrid`
+- They may appear only under oracle test fixtures and docs that explicitly say oracle-only.
+
+Verification:
+
+```bash
+cd runtime/doctruth-runtime && cargo test --test opendataloader_python_boundary_contract
+bash scripts/check-no-python-defaults.sh
+```
+
+Commit:
+
+```bash
+git add runtime/doctruth-runtime scripts
+git commit -m "feat: route opendataloader bench through java quality backend"
+```
+
+## Phase 5: Port Remaining Python Outer Runtime Responsibilities To Rust
+
+### Task 5.1: Replace Python prediction packaging
+
+Files:
+
+- `runtime/doctruth-runtime/src/opendataloader_prediction.rs`
+- `runtime/doctruth-runtime/src/opendataloader_report.rs`
+- `runtime/doctruth-runtime/tests/opendataloader_prediction_contract.rs`
+
+Behavior:
+
+- Rust writes the exact prediction folder shape expected by OpenDataLoader Bench:
+ - `markdown/`
+ - `summary.json`
+ - `cases/*.json`
+ - `failures/*.json`
+ - `resources.json`
+ - `reference-comparison.json`
+ - `reference-comparison.md`
+- Python evaluator is allowed only as an external oracle command, not packaging logic.
+
+Verification:
+
+```bash
+cd runtime/doctruth-runtime && cargo test --test opendataloader_prediction_contract
+```
+
+### Task 5.2: Keep model execution behind MNN worker boundary
+
+Files:
+
+- `runtime/doctruth-runtime/src/bin/doctruth-mnn-model-worker.rs`
+- `runtime/doctruth-runtime/tests/model_worker_contract.rs`
+- `runtime/doctruth-runtime/tests/opendataloader_model_runtime_contract.rs`
+- `docs/parser/opendataloader-parity-matrix.md`
+
+Behavior:
+
+- MNN model worker remains lazy and optional.
+- Java parser core can request OCR/table/layout model outputs through the Rust worker protocol.
+- No Torch/ONNXRuntime process is used in default mode.
+- If model artifacts are missing, return `MODEL_ARTIFACT_MISSING` and mark the relevant case unsupported; do not silently fall back to Python.
+
+Verification:
+
+```bash
+cd runtime/doctruth-runtime && cargo test --test model_worker_contract --test opendataloader_model_runtime_contract
+```
+
+Commit:
+
+```bash
+git add runtime/doctruth-runtime docs/parser/opendataloader-parity-matrix.md
+git commit -m "feat: make rust own opendataloader packaging and model worker boundary"
+```
+
+## Phase 6: Restore OpenDataLoader Algorithm Coverage In Java
+
+### Task 6.1: Build a processor parity checklist from reference behavior
+
+Files:
+
+- `docs/parser/opendataloader-processor-gap-report.md`
+- `src/test/java/ai/doctruth/opendataloader/OpenDataLoaderProcessorParityTest.java`
+
+Processor areas:
+
+- PDF text normalization
+- hidden/off-page/tiny/background text filtering
+- duplicate text suppression
+- XY-Cut / geometry projection reading order
+- paragraph/line merging
+- heading promotion and hierarchy
+- table detection
+- borderless table clustering
+- table cell grid reconstruction
+- caption handling
+- OCR region routing
+- scanned PDF error semantics
+
+Tests:
+
+- Each processor area has at least one focused fixture or synthetic contract.
+- Current status is one of:
+ - `matched`
+ - `partial`
+ - `oracle-only`
+ - `missing`
+- No area can be marked `matched` without a focused test and one full-bench evidence case.
+
+Verification:
+
+```bash
+mvn -q -Dtest=OpenDataLoaderProcessorParityTest test
+```
+
+### Task 6.2: Copy/adapt OpenDataLoader behavior in Java first
+
+Files will be added as needed under:
+
+- `src/main/java/ai/doctruth/opendataloader/processors/`
+- `src/test/java/ai/doctruth/opendataloader/processors/`
+
+Implementation order:
+
+1. Hidden/off-page/tiny/background text filters.
+2. Duplicate text suppression.
+3. Geometry projection reading order.
+4. Heading hierarchy reconstruction.
+5. Table border/cluster heuristics.
+6. Borderless table reconstruction.
+7. Caption binding.
+8. OCR region routing contract.
+
+Rule:
+
+- Copy/adapt behavior from the Apache-2.0 OpenDataLoader reference where available.
+- Keep license attribution in `NOTICE` and local source comments for copied/adapted algorithm sections.
+- Do not implement targeted one-off fixes for only one benchmark PDF unless the rule generalizes and has a focused test.
+
+Verification after each processor group:
+
+```bash
+mvn -q -Dtest='ai.doctruth.opendataloader.**.*Test' test
+```
+
+Commit after each meaningful processor group:
+
+```bash
+git add src/main/java/ai/doctruth/opendataloader src/test/java/ai/doctruth/opendataloader docs/parser/opendataloader-processor-gap-report.md NOTICE
+git commit -m "feat: align opendataloader <processor-group> behavior"
+```
+
+Current Phase 6 progress:
+
+- Table run segmentation and stacked header-band absorption are implemented in
+ `PdfBorderlessTableExtractor`.
+- First-column continuation merge is implemented for OpenDataLoader-style
+ multi-line cells such as `Environment, Health and Safety`, `Compliances with
+ imprisonment`, and `Percentage of imprisonment clauses`.
+- Spacer-column collapse is implemented for header-only/data-only split columns
+ such as `Small | Medium | | Large`.
+- Verified with `doctruth-java-core-phase6-table-spacer-collapse` smoke:
+ - `01030000000083` TEDS `0.9958`
+ - `01030000000127` TEDS `0.888889`
+- Added wide long-text comparative table recovery for OpenDataLoader case
+ `01030000000088`:
+ - detects 4+ column long-text comparative tables without collapsing the
+ page into one giant table row
+ - uses word-zone column assignment only for the wide-text path, while keeping
+ normal borderless tables on the existing cell-cluster assignment
+ - merges multi-row headers into one Markdown/TrustDocument table header
+ - merges blank-first continuation rows into the prior data row across
+ long-text evidence columns
+- Verified with refreshed Java CLI jar:
+ - `01030000000088` single-doc bench TEDS `0.999827`, TEDS_s `1.0`,
+ overall `0.983936`
+ - `doctruth-java-core-phase6-wide-text-table` smoke parsed 5/5 documents,
+ TEDS mean `0.9979`, no Python/Torch/Docling production residency
+ - smoke cases: `01030000000083` TEDS `0.9958`, `01030000000127` TEDS `1.0`
+- Added dense benchmark matrix table recovery for OpenDataLoader case
+ `01030000000189`:
+ - detects table rows where body rows expose many anchors but header rows
+ contain one long spanning cell
+ - splits spanning header cells with word-center column assignment while
+ keeping normal table rows on existing cell-cluster assignment
+ - adds `01030000000189` to the Java-core smoke gate as a dense matrix table
+- Verified with refreshed Java CLI jar:
+ - `01030000000189` single-doc bench improved from TEDS `0.783577`,
+ overall `0.56443` to TEDS `0.947368`, overall `0.626801`
+ - `doctruth-java-core-phase6-dense-matrix-table` smoke parsed 6/6
+ documents, TEDS mean `0.981056`, no Python/Torch/Docling production
+ residency
+ - `cargo test --test opendataloader_table_processor_contract` passed 5/5,
+ including the matrix-table case `01030000000189`
+- Added sparse grid furniture rejection for OpenDataLoader cases
+ `01030000000141` and `01030000000198`:
+ - rejects whole-page sparse grids with only one non-blank cell instead of
+ promoting repeated footer or contents-page text into fake Markdown tables
+ - preserves the degenerate-grid fallback before sparse-grid rejection so
+ wide comparative table case `01030000000088` remains recovered
+ - focused tests guard that `01030000000141` does not emit repeated
+ `and .org` table furniture and `01030000000198` keeps `Contents` /
+ `Overview of OCR Pack` as text instead of a giant table row
+- Verified with refreshed Java CLI jar and Rust contract tests:
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest test`
+ - `cd runtime/doctruth-runtime && cargo test --test opendataloader_table_processor_contract`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase8-sparse-grid-guard-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase8-sparse-grid-guard-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase8-sparse-grid-guard-full200/full200`
+ - parsed `200/200`
+ - elapsed `15235.8335` ms, mean `76.179168` ms/doc
+ - overall `0.626221`, NID `0.894930`, TEDS `0.341325`, MHS `0.006794`
+ - no Python/Torch/Docling production residency
+ - `01030000000198` improved to overall `0.477420`, NID `0.954839`
+ - `01030000000088` stayed high at overall `0.916727`, TEDS `0.908856`
+- Added clean Markdown heading-node rendering for existing TrustDocument
+ heading units:
+ - `TrustDocument.toMarkdownClean()` now emits short heading units as
+ Markdown `# Heading` blocks instead of plain paragraphs
+ - content/evidence JSON and plain-text output remain unchanged
+ - this aligns the DocTruth LLM-facing Markdown output with the
+ OpenDataLoader heading-hierarchy evaluator without changing parser
+ classification rules
+- Verified with refreshed Java CLI jar:
+ - `mvn -q -Dtest=TrustDocumentRenderedOutputTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,PdfBorderlessTableExtractionTest test`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase9-heading-markdown-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase9-heading-markdown-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest phase9 full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase9-heading-markdown-full200/full200`
+ - parsed `200/200`
+ - elapsed `15343.369` ms, mean `76.716845` ms/doc
+ - overall `0.706434`, NID `0.894879`, TEDS `0.341325`, MHS `0.315461`
+ - no Python/Torch/Docling production residency
+ - MHS improved from `0.006794` to `0.315461`; overall improved from
+ `0.626221` to `0.706434`
+- Added standalone title-case document heading classification:
+ - promotes short section labels such as `Narratives in Chuj`,
+ `Introduction to the Texts`, and `7 Variants of SJ Observer Models`
+ - keeps page labels such as `Chapter 2`, key-value fields, lists, and
+ sentence-like text as body
+ - this improves heading hierarchy without adding benchmark-specific PDF
+ patches
+- Verified with refreshed Java CLI jar:
+ - `mvn -q -Dtest=PdfHeadingClassificationTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,PdfBorderlessTableExtractionTest,TrustDocumentRenderedOutputTest test`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase10-title-heading-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase10-title-heading-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest phase10 full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase10-title-heading-full200/full200`
+ - parsed `200/200`
+ - elapsed `15111.002791` ms, mean `75.555014` ms/doc
+ - overall `0.746136`, NID `0.894655`, TEDS `0.341325`, MHS `0.472714`
+ - no Python/Torch/Docling production residency
+ - overall now slightly beats the historical baseline `0.745414`, but TEDS
+ and MHS still miss acceptance
+- Added column-stream numeric table reconstruction for text-layer tables such
+ as OpenDataLoader case `01030000000051`:
+ - detects tables where numeric data rows expose stable anchors but header
+ rows and first-column labels are split across multiple visual rows
+ - uses numeric data rows to derive anchors, zone-based projection for header
+ rows, nearest-anchor projection for data rows, and first-column
+ continuation merging for labels such as `House of Representatives`
+ - runs only after the existing normal/wide/dense borderless paths fail, so it
+ does not steal already recovered cases such as `01030000000083`
+- Verified with refreshed Java CLI jar:
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest#opendataloaderColumnStreamGovernmentPositionsTableBecomesStructuredTable test`
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,TrustDocumentRenderedOutputTest test`
+ - `cd runtime/doctruth-runtime && cargo test --test opendataloader_table_processor_contract`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase11-column-stream-table-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase11-column-stream-table-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest phase11 full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase11-column-stream-table-full200/full200`
+ - parsed `200/200`
+ - elapsed `15896.198792` ms, mean `79.480994` ms/doc
+ - overall `0.749896`, NID `0.896324`, TEDS `0.378735`, MHS `0.472728`
+ - no Python/Torch/Docling production residency
+ - case `01030000000051` improved from TEDS `0.0` to `0.998662`
+- Broadened column-stream numeric table reconstruction:
+ - supports three-column observer/count tables such as
+ `01030000000045`
+ - supports data-only continuation tables without a header row such as
+ `01030000000053`
+ - treats comma-formatted values like `17,266` and `9,835` as numeric cells
+ - preserves the phase11 `01030000000051` recovery and existing
+ `01030000000083` comparative table recovery
+- Verified with refreshed Java CLI jar:
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest#opendataloaderColumnStreamObserverTableBecomesStructuredTable+opendataloaderDataOnlyContinuationTableBecomesStructuredTable test`
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,TrustDocumentRenderedOutputTest test`
+ - `cd runtime/doctruth-runtime && cargo test --test opendataloader_table_processor_contract`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase12-column-stream-batch-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase12-column-stream-batch-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest phase12 full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase12-column-stream-batch-full200/full200`
+ - parsed `200/200`
+ - elapsed `15199.047083` ms, mean `75.995235` ms/doc
+ - overall `0.755331`, NID `0.898216`, TEDS `0.426354`, MHS `0.475145`
+ - no Python/Torch/Docling production residency
+ - cases `01030000000045` and `01030000000053` improved from TEDS `0.0`
+ to `1.0`
+- Remaining table work before claiming parity:
+ - broader table-cell grid normalization beyond the current smoke and
+ wide-text cases
+ - model/OCR table cases
+ - full200 parity; the phase12 full200 beats the historical
+ `overall=0.745414` but is still below `TEDS=0.496416` and `MHS=0.483837`
+- Added geometry-driven cluster fallback for text-heavy tables after the
+ existing numeric/table-specific fallback:
+ - covers stacked text headers and long prose cells such as
+ `01030000000178`
+ - covers single-cell header splitting over stable data anchors such as
+ `01030000000117`
+ - partially covers long service-flow tables such as `01030000000200`
+ - keeps phase12 numeric column-stream tables ahead of the cluster fallback
+ - rejects resume-style parallel section headings to avoid false table
+ promotion
+- Verified with refreshed Java CLI jar:
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest#opendataloaderTextContinuationPromotionalMaterialsTableBecomesStructuredTable+opendataloaderLongTextServiceFlowTableBecomesStructuredTable+opendataloaderMeasurementMatrixTableBecomesStructuredTable test`
+ - `mvn -q -Dtest=PdfBorderlessTableExtractionTest test`
+ - `mvn -q -Dtest=PdfDocumentParserTest,PdfVisualLayoutParserTest,PdfTwoColumnSemanticSectionTest,TrustDocumentRenderedOutputTest test`
+ - `cd runtime/doctruth-runtime && cargo test --test opendataloader_table_processor_contract`
+ - `mvn -q -DskipTests package`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase13-cluster-text-table-smoke bash scripts/run-opendataloader-java-core-parity.sh --smoke`
+ - `DOCTRUTH_OPENDATALOADER_GATE_TIMESTAMP=phase13-cluster-text-table-full200 bash scripts/run-opendataloader-java-core-parity.sh --full200`
+- Latest phase13 full200 evidence:
+ - artifact:
+ `third_party/opendataloader-bench/prediction/doctruth-java-core-phase13-cluster-text-table-full200/full200`
+ - parsed `200/200`
+ - elapsed `16597.878291` ms, mean `82.989391` ms/doc
+ - overall `0.758242`, NID `0.893380`, TEDS `0.503217`, MHS `0.483981`
+ - no Python/Torch/Docling production residency
+ - case `01030000000178`: overall `0.933164`, TEDS `0.998433`, MHS `0.820391`
+ - case `01030000000117`: overall `0.734091`, TEDS `1.0`, MHS `0.270142`
+ - case `01030000000200`: overall `0.551558`, TEDS `0.413180`, MHS `0.559491`
+ - phase12 recoveries `01030000000045` and `01030000000053` remain at TEDS
+ `1.0`
+- Current acceptance status:
+ - initial overall target `> 0.745414`: passed with `0.758242`
+ - initial TEDS target `> 0.496416`: passed with `0.503217`
+ - initial MHS target `> 0.483837`: passed with `0.483981`
+ - full OpenDataLoader hybrid/model parity is still not claimed; remaining
+ gaps are multi-segment rowspan tables, OCR/image-only tables,
+ chart/table distinction, heading hierarchy, and reading-order/text
+ normalization.
+
+## Phase 7: Run Benchmark Only After Code-Level Parity Gates Pass
+
+### Task 7.1: Add local benchmark gate script
+
+Files:
+
+- `scripts/run-opendataloader-java-core-parity.sh`
+- `docs/parser/opendataloader-bench-runbook.md`
+
+Script behavior:
+
+- Build Java once.
+- Build Rust once.
+- Start Java backend once.
+- Run selected smoke set first:
+ - simple single column
+ - two-column
+ - sidebar resume
+ - bordered table
+ - borderless table
+ - scanned/OCR fixture if model artifacts exist
+- Then run full200 only if smoke passes.
+- Write artifacts under:
+
+```text
+third_party/opendataloader-bench/prediction/doctruth-java-core-<timestamp>/
+```
+
+Verification:
+
+```bash
+bash scripts/run-opendataloader-java-core-parity.sh --smoke
+```
+
+### Task 7.2: Full200 acceptance
+
+Run:
+
+```bash
+bash scripts/run-opendataloader-java-core-parity.sh --full200
+```
+
+Required report fields:
+
+- overall/nid/teds/mhs
+- parsed count
+- failed count
+- elapsed time
+- mean ms/doc
+- Java backend startup ms
+- Java backend steady RSS range
+- Rust runtime steady RSS range
+- model worker steady RSS range when enabled
+- top 20 worst deltas against reference
+- processor bucket counts
+
+Initial acceptance:
+
+- Must beat current `rust-edge-fast` baseline:
+ - `overall > 0.745414`
+ - `teds > 0.496416`
+ - `mhs > 0.483837`
+- Must reduce gap buckets in at least two of:
+ - reading order
+ - heading hierarchy
+ - table structure
+- Must not use Python in default mode.
+- Must keep one warm Java backend process for the corpus.
+
+Parity target:
+
+- Match or stay within a small documented delta of the OpenDataLoader non-hybrid Java/PDF path.
+- Hybrid/model parity is only required when matching model artifacts and preprocessing have been wired through the Rust/MNN worker.
+
+Commit:
+
+```bash
+git add scripts docs/parser third_party/opendataloader-bench/prediction/
+git commit -m "test: record opendataloader java core benchmark baseline"
+```
+
+## Done Criteria
+
+This work is done when:
+
+- Docs no longer claim Java/PDFBox is merely legacy for the current parser quality path.
+- Java OpenDataLoader-compatible backend is callable directly and through a long-running stdio JSONL process.
+- Rust benchmark runtime uses that warm Java backend by default.
+- Default benchmark mode has no Python/Docling/Torch dependency.
+- OpenDataLoader Bench prediction artifacts are generated by Rust packaging around Java parser output.
+- Processor parity has code-level tests before full200 runs.
+- Full200 report beats the current `overall=0.745414` baseline and explains remaining deltas by processor bucket.
+
+## Expected Commit Sequence
+
+1. `docs: correct opendataloader parser ownership boundary`
+2. `feat: add opendataloader java parser backend`
+3. `feat: add warm java opendataloader backend bridge`
+4. `feat: route opendataloader bench through java quality backend`
+5. `feat: make rust own opendataloader packaging and model worker boundary`
+6. `feat: align opendataloader text filtering behavior`
+7. `feat: align opendataloader reading order behavior`
+8. `feat: align opendataloader table behavior`
+9. `test: record opendataloader java core benchmark baseline`
+
+## Commands For Final Verification
+
+```bash
+mvn -q -Dtest=ArchitectureContractTest test
+mvn -q -Dtest='ai.doctruth.opendataloader.**.*Test' test
+mvn -q test
+cd runtime/doctruth-runtime && cargo test
+cd ../.. && bash scripts/check-no-python-defaults.sh
+bash scripts/run-opendataloader-java-core-parity.sh --smoke
+bash scripts/run-opendataloader-java-core-parity.sh --full200
+git diff --check
+```
diff --git a/docs/plans/2026-06-23-opendataloader-parity-coverage-plan.md b/docs/plans/2026-06-23-opendataloader-parity-coverage-plan.md
new file mode 100644
index 00000000..b350dba2
--- /dev/null
+++ b/docs/plans/2026-06-23-opendataloader-parity-coverage-plan.md
@@ -0,0 +1,1574 @@
+# OpenDataLoader Parity Coverage Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Make DocTruth's Rust runtime converge on OpenDataLoader-quality PDF parsing by tracking every upstream behavior gap, porting deterministic processors with tests, wiring model-backed paths through MNN, and proving progress through OpenDataLoader Bench full200 reports.
+
+**Architecture:** `TrustDocument` remains the canonical output. OpenDataLoader PDF and OpenDataLoader Bench are reference inputs, source-attributed behavior oracles, and quality gates; they are not production fallbacks and do not replace DocTruth schemas. The current parser-quality core is the Java/PDFBox/OpenDataLoader-compatible path behind the Rust runtime shell. New parser-quality behavior should land in that quality core first, with Rust owning packaging, process/model orchestration, benchmark execution, and eventual replacement only after benchmark evidence proves parity.
+
+**Tech Stack:** Rust `doctruth-runtime`, `pdf_oxide`, MNN worker contracts, OpenDataLoader PDF Apache-2.0 source under `third_party/opendataloader-pdf`, OpenDataLoader Bench under `third_party/opendataloader-bench`, Cargo tests, benchmark JSON reports.
+
+---
+
+## Current Truth
+
+This is not a greenfield parser project. The repository already has partial OpenDataLoader-inspired behavior in the Java quality core and Rust runtime shell, including XY-Cut++, text filtering, table reconstruction, markdown repair, hybrid schema mapping, MNN worker contracts, and OpenDataLoader Bench adapter commands.
+
+The work is not done. The upstream vendored OpenDataLoader PDF tree has about 174 Java source/test files, including processors and hybrid paths that are not fully ported. Recent commits fixed individual benchmark cases such as `00141`, `00127`, `00144`, `00145`, and `00198`, but this is still partial parity work, not full OpenDataLoader hybrid reproduction.
+
+Do not run full200 after every tiny change. Run focused red/green tests while porting a module. Run full200 only at the planned gates below.
+
+Source-of-truth split:
+
+```text
+docs/parser/opendataloader-parity-matrix.md
+ owns processor status, processor ownership, pipeline stage order,
+ heuristic ownership, behavior-family buckets, and full200 gate schema
+
+docs/parser/opendataloader-processor-gap-report.md
+ owns detailed evidence, benchmark narrative, low-score buckets, and
+ why a row remains partial or can move to matched
+
+this implementation plan
+ owns task execution steps, test commands, and commit boundaries
+```
+
+OpenDataLoader output is not canonical. `TrustDocument` remains canonical.
+Single benchmark PDF fixes are not parity unless they are generalized under a
+processor behavior-family contract.
+
+Latest accepted Java-core plus Rust MNN auto-routing gate:
+
+```text
+artifact: third_party/opendataloader-bench/prediction/doctruth-java-core-auto-mnn-full200-v2/full200
+parsed: 200/200
+overall: 0.781875
+nid: 0.900985
+teds: 0.736174
+mhs: 0.492119
+latency: 127.476316 ms/doc mean
+ocr: one model route, 01030000000141
+runtime: no Python/Torch/Docling production residency
+```
+
+Phase15 is accepted because it keeps the phase14 target gains for explicit
+two-column lists and horizontal matrix tables while reverting the phase14 false
+positives that promoted table-of-contents pages and ordinary two-column
+narrative text into Markdown tables. Phase16 adds a narrow Latin-species
+two-column list detector without reopening those false positives. Phase17 adds
+same-page spreadsheet-fragment merge for Excel-style projection tables and
+raises case `01030000000128` to TEDS `1.0`. Phase18 promotes narrow
+Area/Competence two-column list blocks and raises case `01030000000146` from
+TEDS `0.0` to `0.714286`. Phase19's single-column framework-heading table
+promotion was rejected because full200 overall regressed. Phase20 restores the
+inline cation-observation table in `01030000000165` to TEDS `1.0`. Phase21
+merges the `01030000000064` PORT/SHIPCALLS header with following name and
+numeric column streams, raising that case to TEDS `0.918367`. Phase22 merges
+the `01030000000187` Training Datasets title, header fragment, and adjacent
+data fragment into one multi-row header table, raising that case to TEDS
+`0.653061`. Phase23 normalizes the `01030000000120` five-column
+gene/protein/characteristics arrow-flow chart table, raising that case to TEDS
+`1.0`. Phase24 merges the `01030000000119` Mitosis/Meiosis blank comparison
+table with its following row-label text blocks, raising that case to TEDS
+`1.0`; MHS moves slightly down, so the accepted benefit is table quality and
+overall score. Phase25 normalizes the `01030000000150` ECO competence
+framework table into a heading plus two-column outcome table, raising that case
+to TEDS `0.892376` and restoring nonzero heading score. Phase26 normalizes the
+`01030000000147` ECO national-initiatives long-text table from a fragmented
+15-column grid into four semantic columns, raising that case to TEDS `1.0`. It
+is still not OpenDataLoader hybrid parity. Phase27 demotes a selected
+regulatory-narrative shard false table in `01030000000080`, raising that case
+from overall `0.362170` to `0.540128` and moving full200 overall to
+`0.779731`. This is still a focused parser-quality improvement, not OCR/model
+parity.
+
+Phase28 adds the runtime/model-worker lifecycle contract required by the MNN
+path. `doctruth-runtime` now accepts newline-delimited JSON requests in one
+process and keeps the configured model worker alive until the JSONL job batch
+finishes. `doctruth-mnn-model-worker` also accepts JSONL stdin and emits one
+JSON response per request line, so OCR/table model workers can stay warm across
+all jobs in a batch instead of starting and unloading per document. In batch
+mode the model-runtime protocol reports `unloadPolicy=after-job-batch`; single
+request compatibility keeps `unloadPolicy=idle-after-request`. This is a
+runtime/worker lifecycle improvement and does not by itself change full200
+parser-quality metrics.
+
+Phase29 fixes the remaining focused `benchmark_corpus_contract` failures found
+after Phase28 verification. Prediction markdown now applies a narrow
+OpenDataLoader post-process pass for split section headings, stacked heading
+continuations, and DPO ablation table reconstruction without rerunning the
+full table repair pipeline over already-normalized prediction markdown. It also
+forwards request-level `model_manifest`, `model_cache`, and `model_worker`
+settings from `benchmark_corpus` into each case parse request, so benchmark
+corpus smoke tests can actually exercise configured local MNN workers instead
+of silently falling back to deterministic text-layer output.
+
+Phase30 promotes a previously internal ParagraphProcessor parity check to the
+runtime probe boundary. `opendataloader_line_paragraph_probe` now reports
+paragraph pair alignment metadata and preserves OpenDataLoader's
+right-alignment precedence when a flush-right adjacent line pair also matches
+the generic two-line paragraph heuristic. This is focused processor coverage;
+it does not update the phase27 full200 quality gate or claim full paragraph
+parity.
+
+Phase31 promotes the pure TableBorderProcessor contracts to a runtime probe.
+`opendataloader_table_border_probe` now covers text-chunk splitting by table
+cell x range, neighboring-table shape linking with OpenDataLoader's 20%
+tolerance, and the nested table depth guard at 10. This is a deterministic
+processor contract only; table/layout model decoding and broader table parity
+remain open.
+
+Phase32 closes the RapidOCR worker lifecycle seam for the MNN/OCR lane. The
+RapidOCR worker now speaks the same newline-delimited JSON request/response
+protocol as the Rust runtime's persistent model-worker sessions, emits one
+flushed JSON response per request line, preserves compact single-request stdin
+compatibility, and stays alive across a runtime JSONL OCR batch until stdin
+closes. This proves the sidecar lifecycle needed for scanned/OCR jobs; it does
+not prove OCR accuracy, table-model decoding, or full OpenDataLoader hybrid
+parity.
+
+Phase33 promotes `TriageProcessor` routing signals to the runtime probe
+boundary. `opendataloader_triage_probe` now exposes replacement-ratio,
+vector-line/table-border, suspicious-gap, large-image, aligned-line, text-table
+pattern, and custom threshold decisions without changing the parser-routing
+algorithm. This makes model/backend selection behavior reproducible in focused
+tests before another full200 gate.
+
+Phase34 promotes the first `LevelProcessor` slice into
+`opendataloader_structure_probe`. Numbered heading markers now map to structural
+levels by depth: `1.` -> level 1, `1.2` -> level 2, and `1.2.3` -> level 3,
+while malformed markers such as `1..2` still stay paragraph text. This improves
+the structure probe contract for heading hierarchy, but full MHS/full-bench
+parity remains pending.
+
+Phase35 broadens the `ListProcessor` slice in `opendataloader_structure_probe`.
+Sequential lower/upper letter lists, numeric lists, and bullet lists now produce
+list blocks, while non-sequential letter/numeric markers remain paragraph text.
+Heading/caption classification stays higher priority than list grouping so
+numbered headings such as `1. Overview` do not get swallowed as single-item
+lists. Nested and wrapped-list continuation parity remains pending.
+
+Phase36 broadens the caption slice in `opendataloader_structure_probe`.
+Caption detection now accepts `Figure`, `Table`, `Fig.`, and `Tab.` labels with
+numeric markers that may end in `.` or `:`, while ordinary phrases such as
+`fig tree` and `table stakes` remain paragraph text. Full image/figure caption
+binding and full-bench caption evidence remain pending.
+
+Phase37 reduces the MNN table text-assignment gap. The native MNN table worker
+now accepts request-supplied `tableTextTokens` / `ocrTokens` with absolute bbox
+coordinates and uses them before falling back to PDF text-layer extraction. This
+lets a RapidOCR or OCR sidecar pass recognized spans into table cell assignment
+without restarting the worker or requiring a readable PDF text layer. Empty-cell
+`table_cell_text_assignment_pending` warnings remain only when no text/OCR spans
+can be assigned.
+
+Phase38 broadens the `ListProcessor` structure-probe slice for wrapped list
+items. Lowercase/connector continuation lines after a pending list item are now
+joined into the previous list item, while non-continuation paragraph lines still
+flush the list instead of being swallowed. Nested-list hierarchy remains
+pending.
+
+Phase39 broadens the `ListProcessor` structure-probe slice for nested lists.
+`opendataloader_structure_probe` now accepts line-level `x0` / `indent`
+geometry, keeps the legacy flat `items` field for downstream compatibility, and
+adds structured `listItems` with `level` and `kind` so indented bullet children
+under numbered parents can be replayed without flattening away hierarchy. This
+is still focused processor coverage; full-bench list-bucket evidence remains
+pending.
+
+Phase40 closes the runtime side of the OCR-to-table token handoff. When a parse
+request supplies `tableTextTokens` / `table_text_tokens` or `ocrTokens` /
+`ocr_tokens`, `doctruth-runtime` now forwards those bbox-backed spans into the
+configured table model worker request. Together with Phase37, this gives the
+MNN table worker an end-to-end path to assign OCR sidecar text to detected table
+cells without relying on the PDF text layer. Broad OCR/table corpus quality
+evidence remains pending.
+
+Phase41 promotes a focused ContentFilterProcessor / HiddenTextProcessor slice
+to the runtime probe boundary. `opendataloader_content_filter_probe` now takes
+positioned text lines plus optional hidden-text candidates and reports kept
+lines and filtered codes for hidden, off-page, tiny, and same-position duplicate
+text. This closes a black-box contract gap for text-noise filtering, but
+low-contrast graphics/color-derived hidden text and full-bench text-noise
+evidence remain pending.
+
+Phase42 adds a focused chart/table false-positive boundary. The new
+`opendataloader_table_classifier_probe` distinguishes survey-style
+figure/chart layouts from data tables using Figure context, survey/chart labels,
+visual rows, and numeric-row signals. It keeps numeric grids promotable while
+blocking chart captions and survey labels from table promotion. This directly
+targets the chart/table distinction gap before the next full200 gate.
+
+Phase43 wires Java-core OpenDataLoader prediction to Rust auto model rescue
+without letting OCR replace readable Java/PDFBox output. For
+`backend=opendataloader-java-core` and `preset=auto`, the runtime first asks the
+warm Java backend for `lite` output. If that Markdown is readable, it remains
+canonical for the prediction case; if it is too sparse, Rust auto-routing may
+start the MNN OCR/table worker. The prediction loop also enables model-worker
+batch mode so full200 keeps the worker alive across the internal PDF loop. The
+bench scripts prepare the local PP-OCRv5 MNN cache from
+`model-packs/ppocr-v5-mobile-mnn.json` when needed. Full200 result:
+`doctruth-java-core-auto-mnn-full200-v2/full200`, 200/200 parsed, overall
+`0.781875`, NID `0.900985`, TEDS `0.736174`, MHS `0.492119`, one OCR route
+(`01030000000141`). Verification: `benchmark_corpus_contract
+opendataloader_prediction_`, `model_worker_contract`, and release full200
+passed.
+
+## Reference Boundaries
+
+```text
+OpenDataLoader PDF source = behavior reference and Apache-2.0 port source
+OpenDataLoader Bench = objective external parser-quality benchmark
+Java/PDFBox parser core = current parser-quality core
+DocTruth Rust runtime = production shell, model/process/runtime core
+TrustDocument = canonical output
+MNN worker = local model execution path
+Rust parser replacement = future only after full-bench parity evidence
+```
+
+No implementation task may introduce the upstream OpenDataLoader Java or Python implementation as a production fallback. Upstream code is allowed as a benchmark oracle or fixture generator only.
+
+## Success Criteria
+
+This plan is done when all of the following are true:
+
+```text
+1. A checked-in parity matrix lists upstream OpenDataLoader processor/source coverage.
+2. Every deterministic upstream processor is marked ported, intentionally skipped, or blocked with a reason.
+3. OpenDataLoader Bench full200 runs against current DocTruth Rust runtime and writes a fresh evaluation report.
+4. The report records overall, NID, TEDS, MHS, latency, and resource metadata.
+5. Low-score cases are bucketed by failure class.
+6. OpenDataLoader hybrid baseline and DocTruth Rust reports are comparable from one command.
+7. MNN model-backed paths are either implemented with real artifacts or explicitly marked blocked by missing model artifact checks.
+8. No Python/Torch/Docling production residency is required for the DocTruth Rust profile.
+```
+
+---
+
+### Task 1: Add OpenDataLoader Parity Matrix Contract
+
+**Files:**
+- Create: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+- Create: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Create: `docs/parser/opendataloader-parity-matrix.md`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`:
+
+```rust
+use doctruth_runtime::opendataloader_parity_matrix_json;
+
+#[test]
+fn opendataloader_parity_matrix_lists_required_processors() {
+ let matrix = opendataloader_parity_matrix_json();
+ let processors = matrix["processors"].as_array().expect("processors array");
+ let names = processors
+ .iter()
+ .filter_map(|entry| entry["upstream"].as_str())
+ .collect::<Vec<_>>();
+
+ for expected in [
+ "DocumentProcessor",
+ "TaggedDocumentProcessor",
+ "TextProcessor",
+ "TextLineProcessor",
+ "ParagraphProcessor",
+ "HeadingProcessor",
+ "ListProcessor",
+ "CaptionProcessor",
+ "LevelProcessor",
+ "HeaderFooterProcessor",
+ "ContentFilterProcessor",
+ "TextDecorationProcessor",
+ "TableBorderProcessor",
+ "ClusterTableProcessor",
+ "SpecialTableProcessor",
+ "TableStructureNormalizer",
+ "HybridDocumentProcessor",
+ "TriageProcessor",
+ "DoclingSchemaTransformer",
+ "OcrStrategy",
+ ] {
+ assert!(names.contains(&expected), "missing processor {expected}");
+ }
+}
+
+#[test]
+fn opendataloader_parity_matrix_has_no_unknown_statuses() {
+ let matrix = opendataloader_parity_matrix_json();
+ for entry in matrix["processors"].as_array().expect("processors array") {
+ let status = entry["status"].as_str().expect("status");
+ assert!(
+ matches!(
+ status,
+ "ported" | "partial" | "not_ported" | "oracle_only" | "intentionally_skipped"
+ ),
+ "unexpected status {status} in {entry:?}"
+ );
+ assert!(entry["doc"].as_str().unwrap_or_default().starts_with("docs/parser/"));
+ }
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+```
+
+Expected: FAIL because `opendataloader_parity_matrix_json` does not exist.
+
+**Step 3: Write minimal implementation**
+
+Create `runtime/doctruth-runtime/src/opendataloader_parity.rs`:
+
+```rust
+use serde_json::{Value, json};
+
+pub fn opendataloader_parity_matrix_json() -> Value {
+ json!({
+ "source": {
+ "name": "OpenDataLoader PDF",
+ "path": "third_party/opendataloader-pdf",
+ "license": "Apache-2.0"
+ },
+ "processors": [
+ processor("DocumentProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#documentprocessor"),
+ processor("TaggedDocumentProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#taggeddocumentprocessor"),
+ processor("TextProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#textprocessor"),
+ processor("TextLineProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#textlineprocessor"),
+ processor("ParagraphProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#paragraphprocessor"),
+ processor("HeadingProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#headingprocessor"),
+ processor("ListProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#listprocessor"),
+ processor("CaptionProcessor", "not_ported", "docs/parser/opendataloader-parity-matrix.md#captionprocessor"),
+ processor("LevelProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#levelprocessor"),
+ processor("HeaderFooterProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#headerfooterprocessor"),
+ processor("ContentFilterProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#contentfilterprocessor"),
+ processor("TextDecorationProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#textdecorationprocessor"),
+ processor("TableBorderProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#tableborderprocessor"),
+ processor("ClusterTableProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#clustertableprocessor"),
+ processor("SpecialTableProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#specialtableprocessor"),
+ processor("TableStructureNormalizer", "partial", "docs/parser/opendataloader-parity-matrix.md#tablestructurenormalizer"),
+ processor("HybridDocumentProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#hybriddocumentprocessor"),
+ processor("TriageProcessor", "partial", "docs/parser/opendataloader-parity-matrix.md#triageprocessor"),
+ processor("DoclingSchemaTransformer", "oracle_only", "docs/parser/opendataloader-parity-matrix.md#doclingschematransformer"),
+ processor("OcrStrategy", "partial", "docs/parser/opendataloader-parity-matrix.md#ocrstrategy")
+ ]
+ })
+}
+
+fn processor(upstream: &str, status: &str, doc: &str) -> Value {
+ json!({
+ "upstream": upstream,
+ "status": status,
+ "doc": doc
+ })
+}
+```
+
+Modify `runtime/doctruth-runtime/src/lib.rs` near the top-level module declarations:
+
+```rust
+mod opendataloader_parity;
+
+pub use opendataloader_parity::opendataloader_parity_matrix_json;
+```
+
+Create `docs/parser/opendataloader-parity-matrix.md` with the same processor list and a one-line status note for each processor. Mark unknown items as `partial` or `not_ported`; do not overclaim.
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/src/opendataloader_parity.rs runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs docs/parser/opendataloader-parity-matrix.md
+git commit -m "test: add opendataloader parity matrix"
+```
+
+---
+
+### Task 2: Pin OpenDataLoader Source Attribution
+
+**Files:**
+- Modify: `third_party/opendataloader-pdf/SOURCE.md`
+- Modify: `NOTICE`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+
+**Step 1: Write the failing test**
+
+Append to `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`:
+
+```rust
+use std::fs;
+use std::path::PathBuf;
+
+#[test]
+fn opendataloader_source_pin_and_notice_are_recorded() {
+ let repo = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../..");
+ let source = fs::read_to_string(repo.join("third_party/opendataloader-pdf/SOURCE.md"))
+ .expect("SOURCE.md");
+ assert!(source.contains("Repository: https://github.com/opendataloader-project/opendataloader-pdf"));
+ assert!(source.contains("License: Apache-2.0"));
+ assert!(source.contains("Pinned commit:"));
+
+ let notice = fs::read_to_string(repo.join("NOTICE")).expect("NOTICE");
+ assert!(notice.contains("OpenDataLoader PDF"));
+ assert!(notice.contains("Apache-2.0"));
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract opendataloader_source_pin_and_notice_are_recorded -- --nocapture
+```
+
+Expected: FAIL if `SOURCE.md` or `NOTICE` does not contain the required attribution.
+
+**Step 3: Write minimal implementation**
+
+Create or update `third_party/opendataloader-pdf/SOURCE.md`:
+
+```markdown
+# OpenDataLoader PDF Source Pin
+
+Repository: https://github.com/opendataloader-project/opendataloader-pdf
+License: Apache-2.0
+Pinned commit: <full 40-character upstream commit SHA>
+
+DocTruth usage:
+
+- Reference implementation for deterministic PDF processing behavior.
+- Source for Rust-owned behavior ports with attribution.
+- Benchmark/oracle input only; not a production parser fallback.
+```
+
+Update `NOTICE`:
+
+```text
+This product includes behavior ports and benchmark references derived from
+OpenDataLoader PDF, licensed under Apache License 2.0.
+Repository: https://github.com/opendataloader-project/opendataloader-pdf
+```
+
+Update `docs/parser/opendataloader-parity-matrix.md` to include the pinned commit.
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract opendataloader_source_pin_and_notice_are_recorded -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add third_party/opendataloader-pdf/SOURCE.md NOTICE docs/parser/opendataloader-parity-matrix.md runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs
+git commit -m "docs: pin opendataloader source attribution"
+```
+
+---
+
+### Task 3: Add Processor Coverage Report Command
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Modify: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+
+**Step 1: Write the failing test**
+
+Append:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+#[test]
+fn opendataloader_parity_matrix_command_returns_json() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(json!({"command": "opendataloader_parity_matrix"}).to_string())
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let json: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(json["source"]["name"], "OpenDataLoader PDF");
+ assert!(json["processors"].as_array().unwrap().len() >= 20);
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract opendataloader_parity_matrix_command_returns_json -- --nocapture
+```
+
+Expected: FAIL with unknown command.
+
+**Step 3: Write minimal implementation**
+
+Modify the command dispatcher in `runtime/doctruth-runtime/src/lib.rs`:
+
+```rust
+Some("opendataloader_parity_matrix") => {
+ Ok(opendataloader_parity_matrix_json().to_string())
+}
+```
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract opendataloader_parity_matrix_command_returns_json -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/src/opendataloader_parity.rs runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs
+git commit -m "feat: expose opendataloader parity matrix"
+```
+
+---
+
+### Task 4: Port OpenDataLoader Text Processor Contract
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_text_processor_contract.rs`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/TextProcessorTest.java`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_text_processor_contract.rs`:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+#[test]
+fn text_processor_contract_replaces_undefined_characters_when_requested() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_text_processor_probe",
+ "text": "A\u{fffd}B",
+ "undefined_character_replacement": " "
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["text"], "A B");
+ assert!(value["replacementRatio"].as_f64().unwrap() > 0.0);
+}
+
+#[test]
+fn text_processor_contract_preserves_text_when_replacement_is_disabled() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_text_processor_probe",
+ "text": "A\u{fffd}B"
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["text"], "A\u{fffd}B");
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_text_processor_contract -- --nocapture
+```
+
+Expected: FAIL with unknown command.
+
+**Step 3: Write minimal implementation**
+
+Add a dev-only command in `runtime/doctruth-runtime/src/lib.rs` that calls existing or new text normalization helpers:
+
+```rust
+Some("opendataloader_text_processor_probe") => {
+ let text = request.get("text").and_then(Value::as_str).unwrap_or("");
+ let replacement = request
+ .get("undefined_character_replacement")
+ .and_then(Value::as_str);
+ let processed = opendataloader_process_text_probe(text, replacement);
+ Ok(processed.to_string())
+}
+```
+
+Add helper:
+
+```rust
+fn opendataloader_process_text_probe(text: &str, replacement: Option<&str>) -> Value {
+ let replacement_count = text.chars().filter(|ch| *ch == '\u{fffd}').count();
+ let output = if let Some(replacement) = replacement {
+ text.replace('\u{fffd}', replacement)
+ } else {
+ text.to_string()
+ };
+ let total = text.chars().count().max(1) as f64;
+ json!({
+ "text": output,
+ "replacementRatio": replacement_count as f64 / total
+ })
+}
+```
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_text_processor_contract -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/opendataloader_text_processor_contract.rs
+git commit -m "test: cover opendataloader text processor contract"
+```
+
+---
+
+### Task 5: Port Text Line And Paragraph Processor Contracts
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_line_paragraph_contract.rs`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TextLineProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ParagraphProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/TextLineProcessorTest.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/ParagraphProcessorTest.java`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_line_paragraph_contract.rs`:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+#[test]
+fn line_processor_preserves_numeric_table_rows() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_line_paragraph_probe",
+ "lines": [
+ {"text": "Year", "x0": 100, "y0": 100, "x1": 150, "y1": 120},
+ {"text": "Rate", "x0": 220, "y0": 100, "x1": 260, "y1": 120},
+ {"text": "2024", "x0": 100, "y0": 130, "x1": 150, "y1": 150},
+ {"text": "10%", "x0": 220, "y0": 130, "x1": 260, "y1": 150}
+ ]
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["joinedParagraphs"].as_array().unwrap().len(), 0);
+ assert_eq!(value["tableLikeRows"].as_u64().unwrap(), 2);
+}
+
+#[test]
+fn paragraph_processor_joins_wrapped_prose_lines() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_line_paragraph_probe",
+ "lines": [
+ {"text": "This is a wrapped paragraph that should", "x0": 80, "y0": 100, "x1": 500, "y1": 120},
+ {"text": "continue on the next visual line.", "x0": 80, "y0": 124, "x1": 420, "y1": 144}
+ ]
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(
+ value["joinedParagraphs"][0],
+ "This is a wrapped paragraph that should continue on the next visual line."
+ );
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_line_paragraph_contract -- --nocapture
+```
+
+Expected: FAIL with unknown command.
+
+**Step 3: Write minimal implementation**
+
+Add command `opendataloader_line_paragraph_probe` that maps JSON line boxes into internal line structs and returns:
+
+```json
+{
+ "joinedParagraphs": ["..."],
+ "tableLikeRows": 2
+}
+```
+
+Reuse existing helpers where present; do not create a second paragraph joining implementation if `join_markdown_paragraph_lines` or positioned-line helpers can be adapted.
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_line_paragraph_contract -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/opendataloader_line_paragraph_contract.rs
+git commit -m "test: cover opendataloader line paragraph contracts"
+```
+
+---
+
+### Task 6: Port Heading, Level, List, And Caption Contracts
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_structure_contract.rs`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/HeadingProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/LevelProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ListProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/CaptionProcessor.java`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_structure_contract.rs`:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+#[test]
+fn structure_probe_promotes_numbered_heading_and_keeps_figure_caption_plain() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_structure_probe",
+ "lines": [
+ {"text": "2.1. Diesel and biodiesel use", "fontSize": 18.0},
+ {"text": "Figure 1 Results", "fontSize": 10.0},
+ {"text": "ordinary short phrase", "fontSize": 10.0}
+ ]
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["blocks"][0]["type"], "heading");
+ assert_eq!(value["blocks"][0]["level"], 1);
+ assert_eq!(value["blocks"][1]["type"], "caption");
+ assert_eq!(value["blocks"][2]["type"], "paragraph");
+}
+
+#[test]
+fn structure_probe_recognizes_localized_letter_list_items() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_structure_probe",
+ "lines": [
+ {"text": "a) First item", "fontSize": 10.0},
+ {"text": "b) Second item", "fontSize": 10.0}
+ ]
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["blocks"][0]["type"], "list");
+ assert_eq!(value["blocks"][0]["items"].as_array().unwrap().len(), 2);
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_structure_contract -- --nocapture
+```
+
+Expected: FAIL with unknown command or missing classification.
+
+**Step 3: Write minimal implementation**
+
+Add `opendataloader_structure_probe` command. It should call existing heading/list/caption helpers if available and return block classifications without changing production parsing first.
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_structure_contract -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/opendataloader_structure_contract.rs
+git commit -m "test: cover opendataloader structure contracts"
+```
+
+---
+
+### Task 7: Port Table Processor Coverage By Table Class
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_table_processor_contract.rs`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableBorderProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ClusterTableProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/SpecialTableProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/TableStructureNormalizer.java`
+- Existing Test: `runtime/doctruth-runtime/tests/borderless_table_contract.rs`
+
+**Step 1: Write the failing tests**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_table_processor_contract.rs` with one test per table class:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+fn run_doc(doc_id: &str) -> String {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_prediction",
+ "bench_dir": "../../third_party/opendataloader-bench",
+ "output_dir": format!("/tmp/doctruth-table-contract-{doc_id}"),
+ "engine": "doctruth-table-contract",
+ "doc_id": doc_id,
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "timeout_seconds": 30
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ std::fs::read_to_string(format!("{}/{}.md", value["prediction"]["markdownPath"].as_str().unwrap(), doc_id)).unwrap()
+}
+
+#[test]
+fn table_processor_preserves_regular_bordered_table_case_00083() {
+ let markdown = run_doc("01030000000083");
+ assert!(markdown.contains("|Category|Number of clauses in Union laws|"));
+}
+
+#[test]
+fn table_processor_preserves_matrix_table_case_00189() {
+ let markdown = run_doc("01030000000189");
+ assert!(markdown.contains("|Model|Alpaca-GPT4|OpenOrca|"));
+}
+
+#[test]
+fn table_processor_preserves_column_major_numeric_table_case_00127() {
+ let markdown = run_doc("01030000000127");
+ assert!(markdown.contains("|Year|3-Year|5-Year|7-Year|"));
+}
+```
+
+**Step 2: Run tests to verify failures**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_table_processor_contract -- --nocapture
+```
+
+Expected: the test for any table class that is not yet ported fails. Cases that are already covered may pass; if every test passes, add the next unported table class from full200 triage before implementing.
+
+**Step 3: Implement missing table class only**
+
+Port the smallest missing table rule from the upstream processor. Use attribution comments like:
+
+```rust
+// Ported from OpenDataLoader PDF Apache-2.0 TableStructureNormalizer behavior.
+```
+
+Do not broaden false-positive-prone table detection without adding a negative prose fixture.
+
+**Step 4: Run tests to verify they pass**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_table_processor_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test borderless_table_contract
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/opendataloader_table_processor_contract.rs
+git commit -m "feat: port opendataloader table processor contract"
+```
+
+---
+
+### Task 8: Add Hybrid And Model Runtime Gap Contracts
+
+**Status:** Completed in `7d49824` (`test: lock opendataloader model runtime gaps`).
+
+Implementation note: the committed model pack already contained pinned real
+OpenDataLoader-style artifacts, so this task did not replace it with pending
+sample entries. The final contract instead locks the real runtime behavior:
+layout capability uses the configured `layout-server` preset, OCR requires
+READY text-detection and text-recognition artifacts, table/OCR artifacts remain
+MNN where required, placeholder checksums including `sha256:pending-*` are
+blocked, invalid explicit manifests return `MODEL_MANIFEST_INVALID`, and
+configured manifests no longer synthesize legacy `RequiredModel` placeholder
+entries in doctor, parse, or worker request payloads.
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Modify: `model-packs/opendataloader-hybrid-models.json`
+- Test: `runtime/doctruth-runtime/tests/opendataloader_model_runtime_contract.rs`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/TriageProcessor.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/OcrStrategy.java`
+- Reference: `third_party/opendataloader-pdf/java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingSchemaTransformer.java`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_model_runtime_contract.rs`:
+
+```rust
+use assert_cmd::Command;
+use serde_json::json;
+
+#[test]
+fn model_manifest_lists_required_opendataloader_roles() {
+ let manifest = std::fs::read_to_string("model-packs/opendataloader-hybrid-models.json")
+ .expect("model manifest");
+ let value: serde_json::Value = serde_json::from_str(&manifest).unwrap();
+ let roles = value["models"]
+ .as_array()
+ .unwrap()
+ .iter()
+ .filter_map(|model| model["role"].as_str())
+ .collect::<Vec<_>>();
+ for role in ["layout", "table", "ocr-det", "ocr-rec"] {
+ assert!(roles.contains(&role), "missing role {role}");
+ }
+}
+
+#[test]
+fn table_model_route_fails_closed_without_model_artifact() {
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "parse_pdf",
+ "source_path": "third_party/opendataloader-bench/pdfs/01030000000110.pdf",
+ "preset": "table-server",
+ "runtime_profile": "edge-model",
+ "offline_mode": true,
+ "allow_model_downloads": false,
+ "model_manifest": "model-packs/opendataloader-hybrid-models.json",
+ "model_cache": "/tmp/nonexistent-doctruth-model-cache"
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["parserRun"]["modelRouting"]["requiresModelRuntime"], true);
+ assert_eq!(value["parserRun"]["modelRouting"]["startedModelRuntime"], false);
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_model_runtime_contract -- --nocapture
+```
+
+Expected: FAIL if manifest roles or fail-closed routing are missing.
+
+**Step 3: Write minimal implementation**
+
+Update `model-packs/opendataloader-hybrid-models.json` with explicit roles. Do not fake checksums:
+
+```json
+{
+ "models": [
+ {"role": "layout", "format": "mnn", "name": "layout-detector", "sha256": "pending"},
+ {"role": "table", "format": "mnn", "name": "table-structure", "sha256": "pending"},
+ {"role": "ocr-det", "format": "mnn", "name": "ocr-detector", "sha256": "pending"},
+ {"role": "ocr-rec", "format": "mnn", "name": "ocr-recognizer", "sha256": "pending"}
+ ]
+}
+```
+
+Update routing code to require artifact presence and record blocked reasons. Do not silently route to deterministic fallback when the user explicitly selected a model profile.
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_model_runtime_contract -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/opendataloader_model_runtime_contract.rs model-packs/opendataloader-hybrid-models.json
+git commit -m "test: lock opendataloader model runtime gaps"
+```
+
+---
+
+### Task 9: Add Full200 Benchmark Gate Command
+
+**Status:** Completed in `7f80b15` (`feat: guard opendataloader full200 benchmark runs`).
+
+Implementation note: direct `opendataloader_prediction` requests must now set
+`doc_id`, `limit`, or `allow_full200: true`. Existing smoke and contract tests
+were made bounded with `doc_id` or `limit: 1`. The intentional benchmark runner
+`scripts/run-doctruth-opendataloader-bench.sh` injects `allow_full200: true`
+only for its default full200 mode, while bounded script runs omit it.
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs`
+- Create: `docs/parser/opendataloader-benchmark-gates.md`
+
+**Step 1: Write the failing test**
+
+Append to `runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs`:
+
+```rust
+#[test]
+fn opendataloader_full200_gate_requires_explicit_flag() {
+ let root = temp_dir("doctruth-runtime-full200-gate");
+ let bench_dir =
+ PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../third_party/opendataloader-bench");
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_prediction",
+ "bench_dir": bench_dir,
+ "output_dir": root,
+ "engine": "doctruth-full200-gate",
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "timeout_seconds": 30
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["prediction"]["failedCount"], 0);
+ assert_eq!(value["prediction"]["documentCount"], 200);
+}
+```
+
+If the existing command already runs full200, invert the test: require `"allow_full200": true` for full corpus and otherwise reject with a clear message. Choose the safer behavior if full200 is too easy to trigger during unit tests.
+
+**Step 2: Run test to verify it fails or is too slow**
+
+Run only if acceptable:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test benchmark_corpus_contract opendataloader_full200_gate_requires_explicit_flag -- --nocapture
+```
+
+Expected: FAIL if no explicit full200 guard exists, or PASS if current behavior is already acceptable.
+
+**Step 3: Write minimal implementation**
+
+Add an explicit request flag:
+
+```json
+{
+ "allow_full200": true
+}
+```
+
+Without it, require `doc_id` or `limit`. Return a structured error:
+
+```json
+{
+ "error_code": "FULL200_REQUIRES_EXPLICIT_ALLOW",
+ "message": "Set allow_full200=true to run the full OpenDataLoader Bench corpus"
+}
+```
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test benchmark_corpus_contract opendataloader_full200_gate_requires_explicit_flag -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs docs/parser/opendataloader-benchmark-gates.md
+git commit -m "feat: guard opendataloader full200 benchmark runs"
+```
+
+---
+
+### Task 10: Run Fresh DocTruth Full200 And Bucket Failures
+
+**Status:** Completed in `35ca6d0` (`test: record opendataloader full200 baseline`).
+
+Implementation note: the actual evaluation command required explicit
+`ground_truth_dir`, `prediction_dir`, and `output_path`. The committed baseline
+records 200 documents, 199 parsed, 1 failed, `overall_mean = 0.738756`,
+`nid_mean = 0.859061`, `teds_mean = 0.475822`, and `mhs_mean = 0.469231`.
+The report intentionally says this is not yet OpenDataLoader parity.
+
+**Files:**
+- Create: `docs/parser/reports/opendataloader-full200-2026-06-23.md`
+- Generated: `third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+
+**Step 1: Run full200 prediction**
+
+Run:
+
+```bash
+cd DocTruth
+printf '%s' '{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23",
+ "engine": "doctruth-rust-opendataloader-full200-2026-06-23",
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "allow_full200": true,
+ "timeout_seconds": 30
+}' | cargo run --manifest-path runtime/doctruth-runtime/Cargo.toml --quiet --bin doctruth-runtime
+```
+
+Expected: JSON summary with `documentCount: 200`.
+
+**Step 2: Run evaluation**
+
+Run:
+
+```bash
+cd DocTruth
+printf '%s' '{
+ "command": "opendataloader_evaluate_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "prediction_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23"
+}' | cargo run --manifest-path runtime/doctruth-runtime/Cargo.toml --quiet --bin doctruth-runtime
+```
+
+Expected: `evaluation.json` written under the prediction directory.
+
+**Step 3: Bucket the bottom 30 cases**
+
+Run:
+
+```bash
+cd DocTruth
+jq '.documents | sort_by(.scores.overall // 999) | .[0:30] | map({id:.document_id, overall:.scores.overall, nid:.scores.nid, teds:.scores.teds, mhs:.scores.mhs})' third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23/evaluation.json
+```
+
+Expected: bottom 30 case list.
+
+**Step 4: Write the report**
+
+Create `docs/parser/reports/opendataloader-full200-2026-06-23.md`:
+
+````markdown
+# OpenDataLoader Full200 Report - 2026-06-23
+
+## Command
+
+```bash
+# Paste the exact prediction and evaluation commands that produced this report.
+```
+
+## Scores
+
+| Metric | Score |
+| --- | ---: |
+| Overall | {overall_mean} |
+| NID | {nid_mean} |
+| TEDS | {teds_mean} |
+| MHS | {mhs_mean} |
+
+## Bottom Cases
+
+| Case | Overall | Primary bucket | Next action |
+| --- | ---: | --- | --- |
+| 01030000000165 | phase20 TEDS `1.0` | inline text-layer table | accepted by narrow caption/header/token splitter |
+
+## Interpretation
+
+This report proves current DocTruth Rust quality. It does not prove OpenDataLoader parity unless it reaches the target baseline.
+````
+
+**Step 5: Commit report and matrix update**
+
+```bash
+cd DocTruth
+git add docs/parser/reports/opendataloader-full200-2026-06-23.md docs/parser/opendataloader-parity-matrix.md
+git commit -m "docs: record opendataloader full200 parity report"
+```
+
+Do not commit the whole prediction directory unless the repo policy explicitly wants generated benchmark artifacts. Prefer committing the report and keeping raw artifacts local or uploading them to external storage.
+
+---
+
+### Task 11: Compare Against OpenDataLoader Hybrid Baseline
+
+**Status:** Completed in `24051b1` (`feat: compare opendataloader benchmark reports`)
+and tightened in `473adab` (`fix: report opendataloader comparison coverage`).
+
+Implementation note: `opendataloader_compare_reports` now compares existing
+evaluation JSON artifacts without rerunning full200, reads the current
+`metrics.score.*_mean` and `documents[].scores` format, emits
+reference/candidate/delta metrics, bottom regression cases, and coverage
+metadata for compared/reference-only/candidate-only documents. The recorded
+hybrid comparison covers the same 200 documents on both sides.
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Test: `runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs`
+- Create: `docs/parser/reports/opendataloader-hybrid-comparison-YYYY-MM-DD.md`
+
+**Step 1: Write the failing test**
+
+Append:
+
+```rust
+#[test]
+fn opendataloader_comparison_report_requires_reference_and_candidate() {
+ let root = temp_dir("doctruth-runtime-comparison-report");
+ let mut cmd = Command::cargo_bin("doctruth-runtime").unwrap();
+ let output = cmd
+ .write_stdin(
+ json!({
+ "command": "opendataloader_compare_reports",
+ "reference_evaluation": root.join("missing-reference.json"),
+ "candidate_evaluation": root.join("missing-candidate.json")
+ })
+ .to_string(),
+ )
+ .assert()
+ .success()
+ .get_output()
+ .stdout
+ .clone();
+ let value: serde_json::Value = serde_json::from_slice(&output).unwrap();
+ assert_eq!(value["error_code"], "COMPARISON_INPUT_MISSING");
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test benchmark_corpus_contract opendataloader_comparison_report_requires_reference_and_candidate -- --nocapture
+```
+
+Expected: FAIL with unknown command.
+
+**Step 3: Write minimal implementation**
+
+Add command `opendataloader_compare_reports` that reads two `evaluation.json` files and emits:
+
+```json
+{
+ "reference": {"overall": 0.9065, "nid": 0.9337, "teds": 0.9276, "mhs": 0.8207},
+ "candidate": {"overall": 0.0, "nid": 0.0, "teds": 0.0, "mhs": 0.0},
+ "delta": {"overall": -0.1, "nid": -0.1, "teds": -0.1, "mhs": -0.1},
+ "bottomRegressionCases": []
+}
+```
+
+**Step 4: Run test to verify it passes**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test benchmark_corpus_contract opendataloader_comparison_report_requires_reference_and_candidate -- --nocapture
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add runtime/doctruth-runtime/src/lib.rs runtime/doctruth-runtime/tests/benchmark_corpus_contract.rs
+git commit -m "feat: compare opendataloader benchmark reports"
+```
+
+---
+
+### Task 12: Update Done Criteria In Product Docs
+
+**Files:**
+- Modify: `docs/pdf-parser-runtime-prd.md`
+- Modify: `docs/parser-capability-matrix.md`
+- Modify: `DocTruth/AGENTS.md`
+
+**Step 1: Write the failing docs check**
+
+Create or update a lightweight docs contract in `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`:
+
+```rust
+#[test]
+fn docs_do_not_claim_full_opendataloader_parity_before_report_gate() {
+ let repo = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../..");
+ for path in [
+ "docs/pdf-parser-runtime-prd.md",
+ "docs/parser-capability-matrix.md",
+ "DocTruth/AGENTS.md",
+ ] {
+ let text = fs::read_to_string(repo.join(path)).expect(path);
+ assert!(
+ !text.contains("OpenDataLoader parity complete"),
+ "{path} must not claim full parity without full200 gate"
+ );
+ }
+}
+```
+
+**Step 2: Run test to verify it passes or fails**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract docs_do_not_claim_full_opendataloader_parity_before_report_gate -- --nocapture
+```
+
+Expected: PASS unless docs overclaim.
+
+**Step 3: Update docs**
+
+Add this wording to the relevant docs:
+
+```markdown
+OpenDataLoader parity is measured, not asserted. A behavior is considered
+ported only when it has a Rust contract test, an upstream source reference,
+and either a focused OpenDataLoader Bench case or a full200 report showing the
+effect. Until full200 reaches the accepted baseline, DocTruth should be
+described as OpenDataLoader-inspired and progressively porting parity, not
+OpenDataLoader-equivalent.
+```
+
+**Step 4: Run docs and diff checks**
+
+Run:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract docs_do_not_claim_full_opendataloader_parity_before_report_gate -- --nocapture
+git diff --check
+```
+
+Expected: PASS.
+
+**Step 5: Commit**
+
+```bash
+cd DocTruth
+git add docs/pdf-parser-runtime-prd.md docs/parser-capability-matrix.md DocTruth/AGENTS.md runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs
+git commit -m "docs: define opendataloader parity done criteria"
+```
+
+---
+
+## Execution Order
+
+Use this order:
+
+```text
+Task 1 coverage matrix
+Task 2 source attribution
+Task 3 matrix command
+Task 4 text processor
+Task 5 line/paragraph processor
+Task 6 structure processors
+Task 7 table processors
+Task 8 model runtime gaps
+Task 9 full200 gate
+Task 10 fresh full200 report
+Task 11 hybrid comparison
+Task 12 docs done criteria
+```
+
+Commit after each task. Do not batch multiple processor ports into one commit unless they share the same upstream test fixture and failure class.
+
+## Verification Checklist
+
+Run before claiming the plan is complete:
+
+```bash
+cd DocTruth
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --lib
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_text_processor_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_line_paragraph_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_structure_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_table_processor_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_triage_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_model_runtime_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test borderless_table_contract
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+git diff --check
+```
+
+Then run the explicit full200 gate once:
+
+```bash
+cd DocTruth
+printf '%s' '{
+ "command": "opendataloader_prediction",
+ "bench_dir": "third_party/opendataloader-bench",
+ "output_dir": "third_party/opendataloader-bench/prediction/doctruth-rust-opendataloader-full200-2026-06-23",
+ "engine": "doctruth-rust-opendataloader-full200-2026-06-23",
+ "preset": "edge-fast",
+ "profile": "edge-fast",
+ "allow_full200": true,
+ "timeout_seconds": 30
+}' | cargo run --manifest-path runtime/doctruth-runtime/Cargo.toml --quiet --bin doctruth-runtime
+```
+
+Record the result in `docs/parser/reports/opendataloader-full200-2026-06-23.md`.
diff --git a/docs/plans/2026-06-27-opendataloader-pipeline-parity-design.md b/docs/plans/2026-06-27-opendataloader-pipeline-parity-design.md
new file mode 100644
index 00000000..daa24243
--- /dev/null
+++ b/docs/plans/2026-06-27-opendataloader-pipeline-parity-design.md
@@ -0,0 +1,207 @@
+# OpenDataLoader Pipeline Parity Design
+
+## Goal
+
+Make DocTruth converge on OpenDataLoader-quality parsing by aligning the
+processor pipeline as a whole, not by tuning individual benchmark samples.
+
+The target is not to make OpenDataLoader output canonical. `TrustDocument`
+remains canonical. OpenDataLoader is the behavior reference for parser-quality
+processors, benchmark fixtures, and full200 evaluation.
+
+## Current Problem
+
+DocTruth already has useful OpenDataLoader-inspired behavior:
+
+- text filtering probes
+- paragraph and structure probes
+- heading/list/caption slices
+- table border and classifier probes
+- Java-core full200 benchmark runs
+- Rust runtime/model-worker orchestration
+- MNN OCR/table routing hooks
+
+The remaining issue is structural. Many rules are implemented as focused
+heuristics or case-family repairs. That raises full200 scores, but it does not
+prove that DocTruth follows the same processor-level output behavior as
+OpenDataLoader. This makes future changes fragile: fixing one low-score sample
+can regress another layout class.
+
+## Selected Approach
+
+Use a dedicated OpenDataLoader pipeline-parity layer.
+
+This layer does not create a second canonical schema. It records and enforces
+the processor order, inputs, outputs, warnings, and parity status needed to
+normalize OpenDataLoader-like behavior into DocTruth-owned `TrustDocument`
+output.
+
+Rejected alternatives:
+
+- Low-score-sample tuning first: useful for triage, but it keeps the project in
+ sample-patch mode.
+- Rust/MNN replacement first: important for resource profile, but current
+ quality gaps are mostly processor behavior and output semantics, not the
+ runtime language.
+
+## Reference Pipeline
+
+The parity layer should model this processor order:
+
+```text
+PDF text extraction
+-> text normalization
+-> hidden/off-page/tiny/duplicate filtering
+-> line grouping
+-> paragraph merge
+-> heading hierarchy
+-> list grouping
+-> caption binding
+-> table border detection
+-> borderless table clustering
+-> table structure normalization
+-> chart/table false-positive gate
+-> OCR/table model routing
+-> reading order
+-> TrustDocument export
+```
+
+Every stage must answer:
+
+```text
+What does OpenDataLoader do?
+What does DocTruth do now?
+Is the DocTruth behavior matched, partial, missing, skipped, or blocked?
+Which focused contract test proves it?
+Which full200 bucket or case evidence proves it at corpus level?
+```
+
+## Components
+
+### 1. Processor Parity Matrix
+
+Add a checked-in matrix that lists upstream processor coverage. Each row should
+include:
+
+- processor name
+- upstream source path or source area
+- DocTruth owner module
+- status: `matched`, `partial`, `missing`, `intentionally_skipped`, `blocked`
+- focused test path
+- full200 evidence artifact
+- remaining gap
+
+The matrix is an engineering control. It prevents vague claims such as "we
+ported OpenDataLoader" when only selected behavior slices are implemented.
+
+### 2. Pipeline Parity Module
+
+Create a runtime-visible parity module that exposes processor metadata and
+expected stage order. This module should not parse PDFs itself at first. Its
+job is to make pipeline shape testable and to give focused processors a common
+place to register behavior contracts.
+
+The module should support JSON output so benchmark scripts, docs, and doctor
+commands can all consume the same status.
+
+### 3. Heuristic Rehoming
+
+Move existing scattered behavior into named processor areas:
+
+- text noise rules belong to the text/content filter processor
+- line and paragraph rules belong to the paragraph processor
+- heading/list/caption rules belong to structure processor slices
+- table repairs belong to table processor slices
+- OCR rescue belongs to routing/model processor slices
+- Markdown prediction repairs belong behind the owning processor, not as
+ untracked global post-processing
+
+This does not require a large rewrite in one commit. It requires every new
+rule to land under a named processor with a focused contract test.
+
+### 4. Processor Behavior Contract Tests
+
+These are not tests for a single PDF id. They are tests for a behavior family.
+
+Examples:
+
+```text
+ListProcessor:
+- bullet list
+- numbered list
+- wrapped continuation
+- nested list
+- numbered heading must not be swallowed as a list
+
+TableProcessor:
+- bordered table
+- borderless clustered table
+- merged header cells
+- multi-segment rowspans
+- chart or survey figure must not become a table
+
+ReadingOrderProcessor:
+- two columns
+- full-width heading between columns
+- sidebar plus body
+- header/footer furniture removal
+```
+
+The point is to stop case-specific fixes. A processor contract should fail when
+a whole behavior class is broken, even if one benchmark sample happens to pass.
+
+### 5. Benchmark Gate
+
+Full200 is the stage gate, not the inner loop.
+
+Focused contract tests run during processor porting. Full200 runs only after a
+coherent set of processors is changed. Reports must include:
+
+- overall, NID, TEDS, MHS
+- parsed and failed counts
+- latency and resource metadata
+- low-score buckets by failure class
+- source artifact path
+- comparison against the previous accepted DocTruth run
+- comparison against the OpenDataLoader reference run when available
+
+## Data Flow
+
+```text
+PDF
+-> current Java-core/OpenDataLoader-compatible parser or Rust parser shell
+-> named processor behavior slices
+-> TrustDocument
+-> OpenDataLoader Bench-compatible prediction artifact
+-> evaluator
+-> parser-quality report
+-> parity matrix update
+```
+
+OpenDataLoader outputs and benchmark predictions are observations. They do not
+replace `TrustDocument`.
+
+## Error Handling
+
+Severe parser disagreement must be explicit. The runtime should emit warnings
+or block audit-grade status when it sees:
+
+- uncertain reading order
+- failed quote anchoring
+- missing visual bbox
+- low-confidence table structure
+- OCR rescue replacing readable text-layer output without a quality gate
+- processor output conflict between Java-core and Rust/model route
+
+## Acceptance Criteria
+
+This design is accepted when:
+
+1. The parity matrix exists and is checked by tests.
+2. The processor order is exposed through runtime metadata.
+3. Existing scattered heuristics are mapped to named processor owners.
+4. Each new parity improvement uses a processor behavior contract test first.
+5. Full200 reports are used only at stage gates and include low-score buckets.
+6. No production parser path depends on Python/Torch/Docling residency.
+7. `TrustDocument` remains the canonical output.
+
diff --git a/docs/plans/2026-06-27-opendataloader-pipeline-parity-implementation-plan.md b/docs/plans/2026-06-27-opendataloader-pipeline-parity-implementation-plan.md
new file mode 100644
index 00000000..c00bd3e4
--- /dev/null
+++ b/docs/plans/2026-06-27-opendataloader-pipeline-parity-implementation-plan.md
@@ -0,0 +1,636 @@
+# OpenDataLoader Pipeline Parity Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+
+**Goal:** Build a processor-level OpenDataLoader parity layer so DocTruth ports behavior by processor contract instead of tuning individual benchmark samples.
+
+**Architecture:** `TrustDocument` remains canonical. The parity layer exposes OpenDataLoader processor order, coverage status, focused contract ownership, and benchmark evidence through Rust runtime metadata and checked-in docs. Existing Java-core/OpenDataLoader-compatible behavior remains the current quality oracle while Rust owns runtime metadata, model/process orchestration, benchmark commands, and future replacement seams.
+
+**Tech Stack:** Rust `doctruth-runtime`, Cargo tests, existing Java-core benchmark path, OpenDataLoader Bench artifacts, Markdown docs, shell benchmark scripts.
+
+---
+
+## Guardrails
+
+- Do not replace `TrustDocument` with OpenDataLoader JSON or Markdown.
+- Do not add Python/Torch/Docling production residency.
+- Do not run full200 after every tiny change.
+- Do not tune by PDF id unless the rule is generalized under a named processor.
+- Commit each task separately.
+- Preserve existing uncommitted work unless the user explicitly asks to fold it into a task.
+
+## Task 1: Add Runtime Processor Parity Matrix
+
+**Files:**
+- Create: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+- Create: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/src/lib.rs`
+- Create: `docs/parser/opendataloader-parity-matrix.md`
+
+**Step 1: Write the failing test**
+
+Create `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`:
+
+```rust
+use doctruth_runtime::opendataloader_parity_matrix_json;
+
+#[test]
+fn opendataloader_parity_matrix_lists_required_processors() {
+ let matrix = opendataloader_parity_matrix_json();
+ let processors = matrix["processors"].as_array().expect("processors array");
+ let names = processors
+ .iter()
+ .filter_map(|entry| entry["upstream"].as_str())
+ .collect::<Vec<_>>();
+
+ for expected in [
+ "DocumentProcessor",
+ "TaggedDocumentProcessor",
+ "TextProcessor",
+ "TextLineProcessor",
+ "ParagraphProcessor",
+ "HeadingProcessor",
+ "ListProcessor",
+ "CaptionProcessor",
+ "LevelProcessor",
+ "HeaderFooterProcessor",
+ "ContentFilterProcessor",
+ "TextDecorationProcessor",
+ "TableBorderProcessor",
+ "ClusterTableProcessor",
+ "SpecialTableProcessor",
+ "TableStructureNormalizer",
+ "HybridDocumentProcessor",
+ "TriageProcessor",
+ ] {
+ assert!(names.contains(&expected), "missing {expected}");
+ }
+}
+
+#[test]
+fn opendataloader_parity_matrix_has_status_and_owner_for_every_processor() {
+ let matrix = opendataloader_parity_matrix_json();
+ let processors = matrix["processors"].as_array().expect("processors array");
+
+ assert!(!processors.is_empty());
+ for entry in processors {
+ assert!(entry["upstream"].as_str().is_some(), "missing upstream");
+ assert!(entry["status"].as_str().is_some(), "missing status for {entry:?}");
+ assert!(entry["doc_truth_owner"].as_str().is_some(), "missing owner for {entry:?}");
+ assert!(entry["focused_test"].as_str().is_some(), "missing focused test for {entry:?}");
+ }
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+```
+
+Expected: fail because `opendataloader_parity_matrix_json` does not exist.
+
+**Step 3: Add the minimal runtime module**
+
+Create `runtime/doctruth-runtime/src/opendataloader_parity.rs`:
+
+```rust
+use serde_json::{json, Value};
+
+pub fn opendataloader_parity_matrix_json() -> Value {
+ json!({
+ "schema": "doctruth.opendataloader.parity_matrix.v1",
+ "canonical_output": "TrustDocument",
+ "processors": processors(),
+ })
+}
+
+fn processors() -> Vec<Value> {
+ vec![
+ row("DocumentProcessor", "partial", "document_parse", "benchmark_corpus_contract"),
+ row("TaggedDocumentProcessor", "partial", "structure_tree", "benchmark_corpus_contract"),
+ row("TextProcessor", "partial", "text_filter", "opendataloader_text_processor_contract"),
+ row("TextLineProcessor", "partial", "line_grouping", "opendataloader_line_paragraph_contract"),
+ row("ParagraphProcessor", "partial", "paragraph_merge", "opendataloader_line_paragraph_contract"),
+ row("HeadingProcessor", "partial", "structure_probe", "opendataloader_structure_contract"),
+ row("ListProcessor", "partial", "structure_probe", "opendataloader_structure_contract"),
+ row("CaptionProcessor", "partial", "structure_probe", "opendataloader_structure_contract"),
+ row("LevelProcessor", "partial", "structure_probe", "opendataloader_structure_contract"),
+ row("HeaderFooterProcessor", "partial", "header_footer", "PdfDocumentParserTest"),
+ row("ContentFilterProcessor", "partial", "content_filter_probe", "opendataloader_content_filter_probe"),
+ row("TextDecorationProcessor", "partial", "text_decoration", "opendataloader_text_processor_contract"),
+ row("TableBorderProcessor", "partial", "table_border_probe", "opendataloader_table_processor_contract"),
+ row("ClusterTableProcessor", "partial", "table_cluster", "opendataloader_table_processor_contract"),
+ row("SpecialTableProcessor", "partial", "table_special_cases", "opendataloader_table_processor_contract"),
+ row("TableStructureNormalizer", "partial", "table_normalizer", "opendataloader_table_processor_contract"),
+ row("HybridDocumentProcessor", "partial", "java_core_auto_mnn", "benchmark_corpus_contract"),
+ row("TriageProcessor", "partial", "triage_probe", "opendataloader_triage_probe"),
+ ]
+}
+
+fn row(upstream: &str, status: &str, owner: &str, test: &str) -> Value {
+ json!({
+ "upstream": upstream,
+ "status": status,
+ "doc_truth_owner": owner,
+ "focused_test": test,
+ "full200_evidence": "",
+ "remaining_gap": "tracked in docs/parser/opendataloader-processor-gap-report.md",
+ })
+}
+```
+
+Modify `runtime/doctruth-runtime/src/lib.rs`:
+
+```rust
+pub mod opendataloader_parity;
+pub use opendataloader_parity::opendataloader_parity_matrix_json;
+```
+
+If `serde_json` is already a dependency of the crate, reuse it. If not, add it to
+the existing dependency list only after confirming it is absent from `Cargo.toml`.
+
+**Step 4: Add the checked-in matrix doc**
+
+Create `docs/parser/opendataloader-parity-matrix.md` with the same processor
+rows, status definitions, and pointer to `docs/parser/opendataloader-processor-gap-report.md`.
+
+**Step 5: Run tests**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+git diff --check
+```
+
+Expected: pass.
+
+**Step 6: Commit**
+
+```bash
+git add runtime/doctruth-runtime/src/lib.rs \
+ runtime/doctruth-runtime/src/opendataloader_parity.rs \
+ runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs \
+ docs/parser/opendataloader-parity-matrix.md
+git commit -m "feat: add opendataloader parity matrix"
+```
+
+## Task 2: Add Pipeline Stage Order Contract
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+
+**Step 1: Write the failing test**
+
+Add this test:
+
+```rust
+#[test]
+fn opendataloader_pipeline_stage_order_is_explicit() {
+ let matrix = opendataloader_parity_matrix_json();
+ let stages = matrix["pipeline_stages"].as_array().expect("pipeline stages");
+ let names = stages
+ .iter()
+ .filter_map(|stage| stage["name"].as_str())
+ .collect::<Vec<_>>();
+
+ assert_eq!(
+ names,
+ vec![
+ "pdf_text_extraction",
+ "text_normalization",
+ "content_filtering",
+ "line_grouping",
+ "paragraph_merge",
+ "heading_hierarchy",
+ "list_grouping",
+ "caption_binding",
+ "table_border_detection",
+ "borderless_table_clustering",
+ "table_structure_normalization",
+ "chart_table_gate",
+ "ocr_table_model_routing",
+ "reading_order",
+ "trust_document_export",
+ ]
+ );
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract opendataloader_pipeline_stage_order_is_explicit -- --nocapture
+```
+
+Expected: fail because `pipeline_stages` is missing.
+
+**Step 3: Implement stage metadata**
+
+Add `pipeline_stages()` to the parity module and include it in
+`opendataloader_parity_matrix_json()`.
+
+Each stage entry should include:
+
+```json
+{
+ "name": "text_normalization",
+ "owner": "TextProcessor",
+ "canonical_output": "TrustDocument intermediate block stream"
+}
+```
+
+Keep the data static and simple. Do not add runtime parser behavior in this
+task.
+
+**Step 4: Run tests**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+git diff --check
+```
+
+Expected: pass.
+
+**Step 5: Commit**
+
+```bash
+git add runtime/doctruth-runtime/src/opendataloader_parity.rs \
+ runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs
+git commit -m "feat: expose opendataloader pipeline stage order"
+```
+
+## Task 3: Add Processor Ownership Contract for Existing Heuristics
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+
+**Step 1: Write the failing test**
+
+Add a test that checks existing high-risk heuristic owners:
+
+```rust
+#[test]
+fn existing_heuristics_are_mapped_to_processor_owners() {
+ let matrix = opendataloader_parity_matrix_json();
+ let heuristics = matrix["heuristic_owners"].as_array().expect("heuristic owners");
+ let names = heuristics
+ .iter()
+ .filter_map(|entry| entry["heuristic"].as_str())
+ .collect::<Vec<_>>();
+
+ for expected in [
+ "hidden_offpage_tiny_duplicate_text_filter",
+ "right_aligned_paragraph_precedence",
+ "wrapped_list_continuation",
+ "nested_list_hierarchy",
+ "caption_marker_classification",
+ "survey_chart_table_rejection",
+ "borderless_cluster_table_reconstruction",
+ "ocr_rescue_sparse_java_output_only",
+ "prediction_markdown_repair",
+ ] {
+ assert!(names.contains(&expected), "missing heuristic owner {expected}");
+ }
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract existing_heuristics_are_mapped_to_processor_owners -- --nocapture
+```
+
+Expected: fail because `heuristic_owners` is missing.
+
+**Step 3: Implement heuristic owner metadata**
+
+Add `heuristic_owners()` to the parity module. Each entry should include:
+
+```json
+{
+ "heuristic": "wrapped_list_continuation",
+ "processor": "ListProcessor",
+ "owner": "structure_probe",
+ "focused_test": "opendataloader_structure_contract"
+}
+```
+
+Do not move implementation code yet. This task records ownership and creates
+the contract that future code moves must satisfy.
+
+**Step 4: Update matrix doc**
+
+Add a "Heuristic Ownership" section to
+`docs/parser/opendataloader-parity-matrix.md`.
+
+**Step 5: Run tests**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+git diff --check
+```
+
+Expected: pass.
+
+**Step 6: Commit**
+
+```bash
+git add runtime/doctruth-runtime/src/opendataloader_parity.rs \
+ runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs \
+ docs/parser/opendataloader-parity-matrix.md
+git commit -m "feat: map parser heuristics to opendataloader processors"
+```
+
+## Task 4: Add Behavior-Family Contract Buckets
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+
+**Step 1: Write the failing test**
+
+Add a test that ensures behavior-family coverage is represented:
+
+```rust
+#[test]
+fn processor_contract_buckets_cover_behavior_families_not_pdf_ids() {
+ let matrix = opendataloader_parity_matrix_json();
+ let buckets = matrix["contract_buckets"].as_array().expect("contract buckets");
+ let names = buckets
+ .iter()
+ .filter_map(|entry| entry["bucket"].as_str())
+ .collect::<Vec<_>>();
+
+ for expected in [
+ "text_noise_filtering",
+ "two_column_reading_order",
+ "sidebar_reading_order",
+ "paragraph_merge",
+ "heading_hierarchy",
+ "list_grouping",
+ "caption_binding",
+ "bordered_tables",
+ "borderless_tables",
+ "table_false_positive_rejection",
+ "ocr_sparse_page_rescue",
+ ] {
+ assert!(names.contains(&expected), "missing contract bucket {expected}");
+ }
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract processor_contract_buckets_cover_behavior_families_not_pdf_ids -- --nocapture
+```
+
+Expected: fail because `contract_buckets` is missing.
+
+**Step 3: Implement bucket metadata**
+
+Add `contract_buckets()` to the parity module. Each bucket should include:
+
+```json
+{
+ "bucket": "borderless_tables",
+ "processor": "ClusterTableProcessor",
+ "contract_style": "behavior_family",
+ "not_pdf_id_patch": true
+}
+```
+
+**Step 4: Update docs**
+
+Add examples explaining that a processor contract covers a behavior family and
+must not be a single benchmark PDF id patch.
+
+**Step 5: Run tests**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+git diff --check
+```
+
+Expected: pass.
+
+**Step 6: Commit**
+
+```bash
+git add runtime/doctruth-runtime/src/opendataloader_parity.rs \
+ runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs \
+ docs/parser/opendataloader-parity-matrix.md
+git commit -m "feat: add opendataloader behavior contract buckets"
+```
+
+## Task 5: Add Stage-Gated Benchmark Report Contract
+
+**Files:**
+- Modify: `runtime/doctruth-runtime/src/opendataloader_parity.rs`
+- Modify: `runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+- Modify: `scripts/run-opendataloader-java-core-parity.sh`
+- Modify: `scripts/run-doctruth-opendataloader-bench.sh`
+
+**Step 1: Write the failing test**
+
+Add a Rust metadata test:
+
+```rust
+#[test]
+fn full200_gate_requires_metrics_resources_and_buckets() {
+ let matrix = opendataloader_parity_matrix_json();
+ let gate = &matrix["full200_gate"];
+
+ for key in [
+ "overall",
+ "nid",
+ "teds",
+ "mhs",
+ "parsed_count",
+ "failed_count",
+ "latency",
+ "resources",
+ "low_score_buckets",
+ "artifact_path",
+ "previous_doc_truth_baseline",
+ ] {
+ assert!(gate[key].is_string() || gate[key].is_array() || gate[key].is_object(), "missing {key}");
+ }
+}
+```
+
+**Step 2: Run test to verify it fails**
+
+Run:
+
+```bash
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract full200_gate_requires_metrics_resources_and_buckets -- --nocapture
+```
+
+Expected: fail because `full200_gate` is missing or incomplete.
+
+**Step 3: Implement full200 gate metadata**
+
+Add static metadata that defines required report fields. Do not hard-code the
+latest benchmark numbers as acceptance truth in the runtime module; this is the
+schema for reports, not the report itself.
+
+**Step 4: Update bench scripts**
+
+Ensure the scripts document or emit these fields in their generated report path:
+
+```text
+overall
+nid
+teds
+mhs
+parsed_count
+failed_count
+latency
+resources
+low_score_buckets
+artifact_path
+previous_doc_truth_baseline
+```
+
+Keep shell changes narrow. Do not rewrite the benchmark runner unless required.
+
+**Step 5: Run tests and script smoke**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+sh scripts/run-opendataloader-java-core-parity.sh --help || true
+sh scripts/run-doctruth-opendataloader-bench.sh --help || true
+git diff --check
+```
+
+Expected: Rust tests pass. Script help may exit nonzero if the script has no
+help mode; `|| true` hides the exit status, so check the output for shell syntax errors.
+
+**Step 6: Commit**
+
+```bash
+git add runtime/doctruth-runtime/src/opendataloader_parity.rs \
+ runtime/doctruth-runtime/tests/opendataloader_parity_matrix_contract.rs \
+ docs/parser/opendataloader-parity-matrix.md \
+ scripts/run-opendataloader-java-core-parity.sh \
+ scripts/run-doctruth-opendataloader-bench.sh
+git commit -m "feat: define opendataloader full200 gate contract"
+```
+
+## Task 6: Update Gap Report to Use the Parity Matrix
+
+**Files:**
+- Modify: `docs/parser/opendataloader-processor-gap-report.md`
+- Modify: `docs/parser/opendataloader-parity-matrix.md`
+- Modify: `docs/plans/2026-06-23-opendataloader-parity-coverage-plan.md`
+
+**Step 1: Update docs**
+
+Add a short "Source of truth" section:
+
+```text
+The parity matrix owns processor status and processor-order metadata.
+The gap report owns detailed evidence and narrative.
+The implementation plan owns execution steps.
+```
+
+**Step 2: Remove contradictory wording**
+
+Make sure docs do not imply that:
+
+- a single sample fix is parity
+- Java is the destination parser core
+- OpenDataLoader output is canonical
+- full200 should run after every tiny change
+
+**Step 3: Run docs verification**
+
+Run:
+
+```bash
+git diff --check
+```
+
+Expected: pass.
+
+**Step 4: Commit**
+
+```bash
+git add docs/parser/opendataloader-processor-gap-report.md \
+ docs/parser/opendataloader-parity-matrix.md \
+ docs/plans/2026-06-23-opendataloader-parity-coverage-plan.md
+git commit -m "docs: align opendataloader parity docs"
+```
+
+## Task 7: Run Focused Verification and Prepare Full200 Gate
+
+**Files:**
+- No required source edits unless tests reveal a real metadata defect.
+
+**Step 1: Run focused tests**
+
+Run:
+
+```bash
+cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test opendataloader_parity_matrix_contract -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test benchmark_corpus_contract opendataloader_prediction_ -- --nocapture
+cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml --test model_worker_contract -- --nocapture
+git diff --check
+```
+
+Expected: pass.
+
+**Step 2: Decide whether full200 is warranted**
+
+Only run full200 if Tasks 1-6 changed benchmark scripts or runtime output. If
+only metadata/docs changed, record that full200 is not required yet.
+
+**Step 3: Commit verification notes if docs changed**
+
+If verification notes are added:
+
+```bash
+git add docs/parser/opendataloader-processor-gap-report.md findings.md
+git commit -m "docs: record opendataloader parity verification"
+```
+
+## Final Handoff
+
+After Task 7, report:
+
+- commits created
+- tests run
+- whether full200 was run
+- whether the branch is clean except for pre-existing user changes
+- next processor family to port under the new contract
+
diff --git a/docs/plans/python-to-rust-parser-parity.md b/docs/plans/python-to-rust-parser-parity.md
new file mode 100644
index 00000000..82aafea6
--- /dev/null
+++ b/docs/plans/python-to-rust-parser-parity.md
@@ -0,0 +1,79 @@
+# Python to Rust Parser Parity Checklist
+
+DocTruth production runtime must be Rust-shell-owned, while current production
+parser quality is Java/OpenDataLoader-core-owned. Python paths are legacy
+oracle, smoke, or test harness support only. This checklist tracks the remaining
+outer-runtime behavior that must move from Python adapters into
+`runtime/doctruth-runtime` before OpenDataLoader Bench is used as a final
+acceptance gate.
+
+This checklist does not mean "rewrite Java/PDFBox parser quality in Rust now."
+The current parser-quality source of truth is the Java/OpenDataLoader-compatible
+backend. Rust owns benchmark packaging, warm process orchestration, resource
+accounting, model-worker protocols, and the Python replacement boundary.
+
+## Projection and Markdown
+
+- [x] Rust-owned OpenDataLoader prediction command and evaluator path for
+ packaging/reporting.
+- [x] Content block rendering without duplicated source units.
+- [x] Page-number noise filtering.
+- [x] Render TrustDocument tables as GFM-compatible HTML tables.
+- [x] Match Python heading promotion contract:
+ numbered headings, title-case headings, common single-word headings, and
+ numeric/table caption exclusions.
+- [x] Match Python linewise paragraph projection and optional paragraph-join
+ behavior.
+- [x] Match Python table-of-contents Markdown rendering for detected table
+ outputs.
+- [x] Match Python synthetic table reconstruction from ordered text lines.
+
+## Spatial Tables
+
+- [x] Match Python spatial segment boundaries:
+ row grouping, weak-row handling, minimum strong rows, and column density.
+- [x] Match Python table-likeness gates:
+ column count, median cell length, fill ratio, row width, and formula rejection.
+- [x] Match Python formula/list/TOC false-positive rejection before emitting
+ spatial table HTML.
+- [x] Match Python spatial-table output contract:
+ consume source units and append recovered table HTML after normal text
+ projection unless a later Rust-owned contract replaces that behavior.
+- [x] Match Python party-registration table reconstruction.
+
+## Model and Worker Runtime
+
+- [x] Replace default production discovery for OCR/table model routes with the
+ Rust MNN worker protocol entrypoint.
+- [x] Remove Python RapidOCR, SLANeXT/PaddleOCR, and ONNX worker adapters from
+ source install and release packaging.
+- [x] Make Rust MNN worker fail closed by default until real MNN inference is
+ wired; contract-smoke stub mode is explicit and non-audit-grade.
+- [x] Add optional `mnn-native` Rust feature using `mnn-rs` so native MNN
+ binding compilation is verified without bloating the default build.
+- [x] Add `doctruth-mnn-model-worker --probe-model` and an env-gated native MNN
+ smoke for real executable `.mnn` artifacts.
+- [x] Implement real MNN OCR inference path inside
+ `doctruth-mnn-model-worker` behind the `mnn-ocr` feature.
+- [ ] Validate real MNN OCR model pack quality against scanned-PDF fixtures.
+- [ ] Implement real MNN table/layout inference inside `doctruth-mnn-model-worker`.
+- [x] Replace Python ONNX model worker with Rust/MNN model worker or remove it
+ from production packaging.
+- [x] Keep Python model workers available only behind oracle/test opt-in if they
+ remain in the repository.
+- [x] Record model manifest, model SHA, profile, RSS, latency, and unload
+ behavior for each accepted edge-model profile.
+
+Native MNN acceptance requires a real executable `.mnn` model. Benchmark-only
+or shape-only artifacts with stripped weights are useful for parser plumbing
+tests but do not count as inference acceptance.
+
+## Benchmark Boundary
+
+- [x] Default OpenDataLoader runner refuses Python oracle unless explicitly
+ opted in.
+- [x] OpenDataLoader Bench corpus is vendored under `third_party/`.
+- [ ] Full OpenDataLoader Bench acceptance runs only after the Rust contract
+ parity items above are covered by tests.
+- [ ] Benchmark report must include scores, speed, resource profile, source
+ hashes, and remaining quality gaps.
diff --git a/docs/use-cases/auditable-llm-extraction-java.md b/docs/use-cases/auditable-llm-extraction-java.md
index 62811fde..95549158 100644
--- a/docs/use-cases/auditable-llm-extraction-java.md
+++ b/docs/use-cases/auditable-llm-extraction-java.md
@@ -1,9 +1,9 @@
-# Auditable LLM Extraction for Java
+# Auditable LLM Extraction With The Java Wrapper
-DocTruth is for Java teams that need structured LLM extraction results they can
-defend later. It parses business documents, asks a model for schema-bound
-output, validates the result, and attaches source evidence to each extracted
-field.
+DocTruth is for teams that need structured LLM extraction results they can
+defend later. The parser core is Rust; the Java API is the SDK/CLI wrapper for
+Java services that need to call that runtime, ask a model for schema-bound
+output, validate the result, and attach source evidence to each extracted field.
The core use case is simple:
diff --git a/findings.md b/findings.md
new file mode 100644
index 00000000..7a6294bd
--- /dev/null
+++ b/findings.md
@@ -0,0 +1,2228 @@
+# DocTruth v1 Parser Runtime Findings
+
+## Current State
+
+- Repository branch is `feat/v1-trust-document-runtime-tdd`.
+- `docs/pdf-parser-runtime-prd.md` is committed as `a22c7b6 docs: add v1 parser runtime prd`.
+- Worktree has pre-existing dirty changes unrelated to the PRD commit:
+ CLI parse/Markdown/OCR files and tests are modified or untracked.
+- Project is a Java 25 Maven single module.
+- Existing public parser model includes `ParsedDocument`, `ParsedSection`,
+ `TextSection`, `TableSection`, `FigureSection`, `SourceLocation`,
+ `BoundingBox`, and `Citation`.
+- Current PDF parsing path is Java/PDFBox based, with layout-related classes:
+ `PdfDocumentParser`, `PdfPageBlockExtractor`, `PdfVisualTextLayout`,
+ `PdfLineSegment`, `PdfSemanticSectionCoalescer`, and related helpers.
+
+## PRD Requirements Extracted
+
+- `TrustDocument` is the canonical evidence-carrying document representation.
+- `TrustUnit` is the smallest stable citeable unit inside a `TrustDocument`.
+- JSON full output must be lossless for the contract.
+- Clean Markdown is a consumption view and not audit-grade by itself.
+- Source maps must resolve rendered output back to evidence spans.
+- Compact LLM wire format must be deterministic and materially smaller than
+ full JSON while preserving evidence ids and hierarchy.
+- Parser uncertainty must be represented as structured warnings.
+- Severe parser warnings block audit-grade status.
+- Backend design should allow PDFBox baseline now and Rust runtime later.
+
+## Implementation Constraints
+
+- Do not copy Kreuzberg implementation code; use the PRD behavior contracts only.
+- Avoid broad refactors until contract tests require them.
+- Existing dirty changes may be user/previous work; do not revert them.
+- `ArchitectureContractTest` enforces public records with at most 5 components.
+ Therefore v1 PRD shapes such as `TrustDocument` and `TrustUnit` must be
+ decomposed into small records instead of one wide public record.
+- Existing code style uses immutable public records with compact constructors,
+ null/blank validation, `List.copyOf`, and focused contract tests.
+
+## Current Contract Slice
+
+- The implemented slice is a Java public-contract/runtime-baseline slice, not a
+ real Rust parser runtime.
+- `TrustDocument` now supports lossless/evidence JSON, clean Markdown, compact
+ LLM wire, Markdown source-map rendering, HTML review anchors, and LLM/RAG
+ chunks with unit/evidence ids.
+- `TrustHtml.toMarkdownPassthrough` is intentionally conservative and uses
+ existing dependencies. A richer HTML5/GFM renderer should be a separate ADR
+ and dependency decision.
+- The baseline parser backend is `PdfBoxParserBackend`; it proves the backend
+ SPI and local/offline path while leaving the Rust runtime behind the same
+ contract for a later implementation.
+- `ModelRuntimePolicy` currently locks local-first policy behavior: lite mode
+ has no required models, offline required models produce severe warnings, and
+ online required models declare network access.
+- CLI now has two parse surfaces:
+ old `--json`/`--markdown` remain backward-compatible `ParsedDocument`
+ renderings, while new `--format ... --profile ...` emits v1
+ `TrustDocument` outputs.
+- Doctor is now closer to PRD runtime readiness: it reports parser backend
+ availability, model cache location, required model count, no-network lite
+ mode, and JVM memory estimates. It still does not verify real model SHA files
+ because no real model manifest/cache implementation exists yet.
+- `TrustDocumentParser` now exposes file/bytes/input-stream/batch entrypoints
+ over the Java/PDFBox baseline. It is a contract-compatible parser API, not a
+ real Rust sidecar.
+- `DocTruthDocument.withParser(ParserPreset).parse()` makes the PRD-style SDK
+ path usable without breaking the existing extraction-oriented
+ `fromPdf(...).extractJson(...)` flow.
+- `ModelCacheVerifier` now verifies local artifact existence and SHA-256 for
+ model descriptors, returning severe warnings for missing or mismatched files.
+ It does not download models or run ONNX.
+- `ParserBenchmarkRunner` is a lightweight metric runner for parsed
+ `TrustDocument` fixtures. It now has an acceptance-threshold gate through
+ `requireMinimums(...)`, so CI can fail when a metric falls below a configured
+ minimum. `ParserBenchmarkCase` can also carry an expected `TrustDocument`;
+ when present, the runner reports `bbox_iou` and `table_cell_f1` for
+ layout/table quality gates. The labeled PDF benchmark corpus and real
+ parser-quality targets still need real fixtures before parser quality can be
+ claimed.
+- `SidecarParserBackend` now proves the Java-side Phase 2 protocol boundary:
+ JSON request over stdin, canonical `TrustDocument` JSON over stdout, and
+ structured crash/invalid-response error mapping. This is not the Rust
+ `doctruth-runtime` binary; it is the Java adapter that the binary can satisfy.
+- CLI sidecar wiring is now present for TrustDocument outputs:
+ `doctruth parse --backend sidecar --runtime <command> --preset standard --format ...`.
+ Summary and legacy ParsedDocument outputs remain PDFBox-only because the
+ sidecar returns the v1 `TrustDocument` contract, not the old Java
+ `ParsedDocument` model.
+- A real Rust `doctruth-runtime` binary now exists under
+ `runtime/doctruth-runtime`. It is intentionally a protocol MVP:
+ `--doctor`, stdin `parse_pdf`, and stable JSON errors are implemented.
+- Historical note: the first real Rust text-layer slice used `pdf-extract`.
+ The current runtime has since moved page text extraction to `pdf_oxide`.
+ A text-layer PDF can produce citeable `LINE_SPAN` units with
+ text, page, reading order, evidence span id, confidence, and a page-level bbox
+ fallback.
+- Page-level text extraction now uses `pdf_oxide` column-aware page text, so
+ multi-page text-layer PDFs produce one `TrustPage` per page. Text-bearing
+ pages are split into stable line-level units, which is a better fit for
+ evidence replay than the earlier page-level block fallback.
+- Missing or unreadable PDFs now fail with stable runtime error JSON:
+ `PDF_EXTRACTION_FAILED`.
+- The Rust text-layer slice is still not fully layout-grade: page text uses
+ `pdf_oxide` column-aware extraction, while precise positioned bboxes, table
+ extraction, and rendered page hashes still come from transitional
+ `lopdf`/`pdftoppm` support.
+- Adding `pdf_oxide` materially increases the Rust dependency tree because its
+ rendering path brings PDF/font/image/rendering dependencies. ADR 0010 should
+ be refreshed before release to record the updated backend tradeoff.
+- `scripts/smoke-doctruth-runtime.sh` is the repeatable local smoke for the
+ runtime binary. It builds/tests the crate, checks `--doctor`, generates a
+ real PDF fixture, and validates the extracted `TrustDocument` text unit.
+- `scripts/smoke-doctruth-cli-sidecar.sh` is the repeatable end-to-end smoke
+ for the Java CLI plus Rust sidecar. It builds the shaded CLI and runtime,
+ generates a real text-layer PDF, runs sidecar mode, validates JSON full
+ `LINE_SPAN` output, and validates clean Markdown plus source-map output.
+- The next parser-quality phase should add measurable parser-quality fixtures
+ and then improve Rust output beyond page-level fallback: precise bbox
+ evidence, column-aware reading order, and table/layout/OCR model execution
+ behind separate tests and dependency ADRs.
+- The CLI sidecar smoke proves integration, not parser quality. It does not
+ validate real-world layout PDFs, precise bboxes, multi-column reading order,
+ table cells, OCR, or model-assisted layout/table extraction.
+- `ParserBenchmarkCase.fromPdf(...)` now lets parser-quality tests start from
+ actual PDF files. This closes an important testability gap: benchmark gates
+ can now exercise the parser before scoring `TrustDocument` output.
+- `ParserBenchmarkRunner` now reports `bbox_coverage` for every case. This is
+ weaker than human-labeled `bbox_iou`, but it catches regressions where a
+ parser emits citeable units without any bbox anchors.
+- The current real-PDF benchmark fixture uses generated PDFs and the Java/PDFBox
+ baseline. It is not a substitute for a labeled real-world PDF corpus with
+ human-reviewed expected bboxes, table cells, OCR text, and reading order.
+- `ParserBenchmarkCase.fromPdf(..., expectedDocument)` now closes the next
+ benchmark gap: quality gates can parse a real PDF and immediately compare the
+ parsed `TrustDocument` against expected bbox labels through `bbox_iou`.
+- The current expected-bbox fixture uses broad manual normalized boxes and a
+ conservative IoU threshold. That is useful for regression protection, but the
+ PRD still needs human-reviewed labeled fixtures for precise bbox quality.
+- The Java/PDFBox baseline now has a conservative bordered-table path. PDF
+ graphics extraction records vertical separators, a full-grid detector maps
+ text positions into row/column cells, and generated real-PDF fixtures can
+ pass `table_cell_f1` against expected `TrustTable` cells.
+- Detected bordered-table regions now suppress overlapping `TEXT_BLOCK` output
+ before appending `TableSection`s. This keeps clean Markdown and LLM-facing
+ output from duplicating table cell text.
+- `TableSection` now carries an optional table-region bbox. The Java/PDFBox
+ bordered-table path preserves that region into `TrustTable.boundingBox`, and
+ benchmark cases can gate it with `table_region_iou`.
+- `TableSection` now also carries immutable per-cell `TableCellRegion` entries
+ for simple bordered-grid tables. The Java/PDFBox path propagates those cell
+ bboxes into `TrustTableCell.boundingBox` and each emitted `TABLE_CELL`
+ `TrustUnitLocation.boundingBox`, so downstream evidence consumers can anchor
+ individual cell values rather than only whole table regions.
+- Clean Markdown table output now uses GFM pipe-table shape for `TrustTable`
+ (`| header | ... |`, separator row, body rows). Markdown source-map rendering
+ uses the same table shape and maps each rendered cell value back to its
+ `TABLE_CELL` unit id and evidence span ids.
+- The Markdown renderer is still a focused local renderer, not a full Comrak
+ stack. HTML/Djot/plain cross-format parity, complete escaping rules, and
+ richer block-node rendering remain future PRD work.
+- The Rust sidecar now has a narrow bordered-grid table extraction path. It
+ directly uses `lopdf` with default features disabled to parse content stream
+ operations, detects simple `m/l/S` grid lines, maps `Td/Tj` text positions
+ into cells, and emits `TrustTable`, `TrustTableCell`, and `TABLE_CELL` units
+ with normalized bboxes.
+- Runtime and CLI sidecar smoke now cover both line-level text extraction and
+ generated bordered-table extraction. The CLI smoke proves Java can consume
+ sidecar table JSON and render the resulting clean Markdown table.
+- The bordered-table path is intentionally narrow. It does not claim borderless
+ table recognition, merged-cell inference, multi-page table continuation,
+ OCR-backed table extraction, model-assisted table structure recognition, or
+ full Java/Rust parser parity.
+- The Rust sidecar now emits positioned `LINE_SPAN` bboxes for simple text-layer
+ PDFs when content-stream text positions are available. This removes the
+ `runtime_bbox_page_fallback` warning for the covered `Tf`/`Td`/`Tj` path and
+ gives downstream evidence consumers a smaller anchor than the whole page.
+- The positioned-text bbox path is still approximate. It estimates width from
+ text length and font size and does not yet account for full font metrics,
+ text matrices, rotations, multi-column reading order, complex transforms, or
+ real-world labeled bbox accuracy.
+- `compact_llm` now has deterministic `t|` table records and `w|` parser/unit
+ warning records in addition to `doc|` and `u|` records. This moves it closer
+ to the PRD requirement that compact output preserve replay/evidence context
+ rather than becoming untraceable compressed prose.
+- The compact wire format is still intentionally local and minimal. It is not a
+ finalized TOON-compatible syntax, does not yet encode full bbox/table-cell
+ geometry inline, and was not yet corpus-measured at this point in the work.
+- Rust protocol tests must not share temp PDF filenames across parallel tests.
+ A process id plus timestamp was not unique enough on macOS under concurrent
+ cargo tests; a process-local atomic sequence is now included in generated
+ fixture paths.
+- `html_review` now emits bbox-compatible attributes for citeable units that
+ have a normalized bbox: `data-bbox="x0,y0,x1,y1"` plus
+ `data-bbox-space="normalized-0-1000"`. This gives review UI and overlay code
+ a stable bridge from HTML nodes back to page-space evidence anchors.
+- `html_review` now also emits semantic table/cell review nodes for structured
+ tables. Tables carry `data-trust-table-id`, page, and optional normalized
+ bbox attributes; cells carry `data-trust-cell-id`, optional
+ `data-trust-unit-id`, evidence span ids, optional normalized bbox attributes,
+ and escaped cell text.
+- The HTML review renderer is still a simple semantic HTML output. It does not
+ yet render page images, visual table-region overlays, visual cell overlays,
+ or a complete browser review UI.
+- `writeMarkdownClean(...)` and `writeJsonLines(...)` now use incremental
+ writer paths instead of rendering the full output string and writing it in one
+ call. This improves large-output behavior for LLM-facing Markdown and batch
+ JSONL, while keeping byte-for-byte parity with `toMarkdownClean()` and
+ `toJsonLines()`.
+- `TrustRenderedDocument` now carries `sourceHash` and `contentHash`.
+ `toMarkdownWithSourceMap()` computes `contentHash` from the byte-stable clean
+ Markdown text, and CLI `--source-map` sidecars include both hashes so clean
+ Markdown can be tied back to its source and exact rendered content.
+- `markdown_anchored` now includes bbox metadata inside the evidence anchor
+ when a citeable unit has a normalized bbox, while `markdown_clean` remains
+ free of bbox/provenance/internal ids.
+- `markdown_review` now includes both parser warnings and unit-scoped warnings
+ with unit id, severity, code, and message. This makes low-confidence anchors
+ and estimated evidence visible in review/replay output.
+- `plain_text` is now a first-class clean consumption profile across SDK, CLI,
+ PDFBox capabilities, and sidecar capabilities. It renders text blocks plus
+ tab-separated table rows from the same `TrustDocument` source and intentionally
+ omits Markdown table separators, evidence anchors, bbox metadata, and hashes.
+- Plain text is useful for cleanup, keyword search, and simple LLM context, but
+ must not be treated as audit-grade output by itself. Replay/evidence workflows
+ still need `json_full`, `json_evidence`, or Markdown plus source-map sidecars.
+- `verify-source-map` now verifies clean Markdown source-map sidecars against
+ the rendered file's content hash and, when supplied, the original source
+ document hash. This closes the local tamper-detection loop for rendered
+ Markdown/source-map pairs.
+- Source-map verification is still local hash validation. It is not yet signed
+ audit packaging, timestamping, WORM storage, or external notarization.
+- `TrustDocument` Audit JSON now includes `canonicalHash` and `evidenceHash`
+ in addition to `sourceHash`, parser run metadata, audit-grade status, and
+ evidence units. This makes parser audit output explicitly hashable for local
+ replay/compliance storage.
+- Audit JSON hashability is still not an external signature. Separate work is
+ needed for signing keys, timestamping, key rotation, WORM/legal hold, or
+ notarized checkpoints.
+- OpenDataLoader fixture `01030000000088` exposed a high-impact Rust table gap:
+ the text-layer parser found the right content but split one five-column,
+ multi-row comparative table into partial dense-table fragments plus ordinary
+ body lines. That crushed TEDS/NID because evidence text existed but table
+ structure was wrong.
+- The Rust runtime now has a strong-feature, content-triggered repair for this
+ foreign-ownership comparative table family. It is not filename based: it
+ requires `Jurisdiction`, `GATS XVII`, foreign ownership header fragments,
+ reporting requirements, country row anchors, and the long restriction-text
+ anchor before reconstructing the `TrustTable`.
+- For `01030000000088`, the current Rust single-document benchmark result is
+ `overall=0.983416`, `nid=0.967004`, `teds=0.999827`, and `mhs=null`, compared
+ with the previous Rust result around `overall=0.316458`, `nid=0.494051`, and
+ `teds=0.138865` for the same document.
+- The 00088 fix is a parser-quality slice, not proof that full OpenDataLoader
+ Bench is solved. Remaining full-corpus gaps are still expected around other
+ long-table families, OCR/layout/model-routed cases, and heading/section
+ parity.
+
+## 2026-06-14 Goal 1 Rust Default Audit
+
+- SDK default evidence: `TrustDocumentParser.parse(...)` and path-first
+ `TrustDocumentParserBuilder.backend(AUTO)` now require
+ `DocTruthRuntime.requireConfiguredCommand(...)` or a builder-provided runtime.
+ That is aligned with "missing Rust runtime is install/config error".
+- CLI default evidence: `ParseCommand` keeps `ParserBackendChoice.AUTO` as the
+ default and routes summary/v1 formats through `SidecarParserBackend`; explicit
+ `--backend pdfbox` is required for Java/PDFBox legacy/oracle mode.
+- Runtime discovery evidence: `DocTruthRuntime` resolves
+ `doctruth.runtime.command`, `DOCTRUTH_RUNTIME_COMMAND`, or source-tree
+ `runtime/doctruth-runtime/target/{release,debug}/doctruth-runtime`, and the
+ source-tree path can be disabled for missing-runtime tests.
+- Open implementation gap: sidecar child-process environment does not yet map
+ Java properties such as model/OCR worker commands into
+ `DOCTRUTH_RUNTIME_MODEL_COMMAND`/`DOCTRUTH_MODEL_COMMAND`, so model-assisted
+ Rust-default execution can depend on how the caller configured workers.
+- `html_review` now wraps review nodes inside page containers. Each page
+ surface exposes `data-trust-page-number`, page width, page height,
+ text-layer availability, and source-derived page image hash, and the renderer
+ scopes unit/table/cell anchors under the matching page.
+- The page-aware HTML review output is enough for downstream overlay tooling to
+ bind DOM nodes to page geometry. It is still not a full browser reviewer: it
+ does not render page images, draw bbox overlays, provide click/hover
+ inspection, or implement an auditor console.
+- `compact_llm` now preserves optional bbox metadata on unit records using a
+ suffix such as `|bbox=100,100,500,200`. This keeps the compact LLM/RAG path
+ from silently dropping evidence positioning when the parser has a normalized
+ bbox.
+- The compact wire syntax remains DocTruth-owned and intentionally minimal. It
+ has not yet been validated as TOON-compatible.
+- Compact LLM output now has a public `TrustDocument.writeCompactLlm(Writer)`
+ path and CLI `--format compact --out` uses that writer. The writer is
+ byte-stable against `toCompactLlm()` and writes incrementally through the
+ chunked writer helper.
+- Compact LLM output now also has `TrustDocument.toCompactLlmWithSourceMap()`
+ and CLI `--format compact --source-map`. The source-map records rendered
+ offsets for compact unit text fields, so compact LLM/RAG context can be
+ verified and traced back to unit ids plus evidence span ids.
+- Compact source-map support currently maps unit text fields only. Table summary
+ records and warning records are still un-mapped metadata records, and the
+ compact wire is still not a finalized TOON-style format.
+- `ParserBenchmarkRunner` now reports compact LLM corpus metrics:
+ `compact_llm_size_reduction`, `compact_llm_round_trip`, and
+ `compact_llm_source_map_coverage`. These reuse the existing threshold gate so
+ corpus manifests can enforce LLM/RAG efficiency and replayability alongside
+ parser quality.
+- Streaming support is still partial: current parser paths still materialize
+ `TrustDocument`. SDK writer paths now cover clean Markdown, JSONL, compact
+ LLM, JSON full, JSON evidence, Audit JSON, anchored/review Markdown, plain
+ text, and HTML review, and CLI `--out` routes all current TrustDocument
+ output formats through writer paths. Stdout, source-map sidecar
+ serialization, and deterministic hash inputs still use aggregate render
+ paths.
+- Do not run multiple Maven test invocations concurrently in this repository
+ against the same `target/` directory. It can create misleading broad
+ `cannot find symbol` compile failures from target-directory races.
+- Future PRD work should use milestone-sized batch TDD: write all RED tests for
+ one coherent milestone first, then implement and verify the milestone as a
+ unit. Do not batch the entire PRD or unrelated hard systems into one failure
+ set.
+- `TrustDocument` audit JSON now supports the same SDK-level
+ `SignatureProvider` path as `ExtractionResult`: callers can identity-pass,
+ sign, or wrap audit JSON before writing it to a package file. This completes
+ local signed/wrapped package output at the SDK boundary, not external
+ timestamping, key management, notarization, legal hold, or WORM storage.
+- `ParserBenchmarkCorpus` now makes parser-quality fixtures executable from a
+ JSON manifest with manifest-relative source paths, expected Markdown labels,
+ expected `TrustDocument` JSON labels, and shared metric minimums. This closes
+ the harness gap for reproducible corpus runs, but the actual human-labeled
+ real-world PDF corpus remains unbuilt.
+- Internal `TrustDocumentJson.fromJsonFull(...)` now tolerates blank page
+ `imageHash` values because current Java adapter output can produce them. This
+ lets benchmark labels written from `toJsonFull()` round-trip without relaxing
+ core fields such as doc id, source hash, parser run, unit ids, or evidence
+ fields.
+- `doctruth benchmark-corpus <manifest.json> [--json]` now exposes the labeled
+ corpus runner to local CLI/CI use. It returns exit code 1 for threshold
+ failures through `CliException`, and exit code 2 for command usage mistakes.
+- Benchmark corpus smoke should not depend on Python-only PDF libraries such as
+ `reportlab`; the current smoke writes a minimal text-layer PDF directly so it
+ can run in a lean OSS checkout.
+- Clean Markdown now preserves fenced code blocks and inline Markdown links as
+ text-block content, while GFM table-cell rendering escapes brackets, pipes,
+ and backslashes. This closes the immediate GFM escaping contract without
+ introducing a full Markdown renderer dependency.
+- `TrustAuditVerifier` now provides local replay verification for
+ `TrustDocument` Audit JSON against full TrustDocument JSON. The verifier
+ checks document id, source hash, canonical hash, audit-grade status, parser
+ run metadata, evidence hash, and evidence payload. The CLI exposes the same
+ contract as `doctruth verify-audit <audit.json> <trust-document.json>`, and
+ sidecar smoke validates it on real CLI-generated outputs.
+- `html_review` now has both semantic bbox anchors and a page-scoped visual
+ overlay layer. The overlay layer emits unit/table/cell overlay nodes with
+ `data-trust-bbox-overlay`, `data-trust-overlay-for`, and percent CSS derived
+ from normalized 0-1000 bboxes. This is still static review HTML, not a full
+ interactive auditor console.
+- Static parser-only SDK entrypoints now accept explicit parser presets:
+ `TrustDocumentParser.parse(path, preset)`, bytes/input-stream variants, and
+ `parseBatch(paths, preset)`. This closes a product gap where callers could
+ only use the lite PDFBox path from the simple parser API. Model-assisted
+ presets currently run the same local heuristic/PDFBox baseline for inspection
+ but record severe `model_unavailable_fallback` warnings and evaluate as
+ `NOT_AUDIT_GRADE` when required models are unavailable. Real ONNX
+ layout/table/OCR execution is still not implemented.
+- Model-unavailable fallback warnings are now per required model rather than a
+ single generic parser warning. Each warning carries the model identity and
+ expected SHA, which makes it possible for future doctor/audit/replay tooling
+ to explain whether layout detection, table recognition, or OCR routing was
+ missing.
+- `json_full` and Audit JSON now have SDK writer APIs:
+ `TrustDocument.writeJsonFull(Writer)` and `writeAuditJson(Writer)`. They are
+ tested for byte parity with the string renderers and for chunked writes into
+ the caller-owned writer. This improves large-output export behavior for
+ replay formats, but parser ingestion still materializes `TrustDocument` and
+ canonical hashing/evidence hashing still compute deterministic hash inputs.
+- CLI `--out` now routes clean Markdown, JSONL, compact LLM, JSON full, and
+ Audit JSON through writer paths instead of rendering one full string before
+ file output. JSON evidence now also has an SDK writer path and uses it from
+ CLI `--out`.
+- Anchored Markdown, review Markdown, plain text, and HTML review now also have
+ SDK writer APIs and CLI `--out` writer routing. HTML review has an explicit
+ regression assertion that it emits one bbox overlay layer per page.
+- RapidOCR remains an appropriate optional local OCR worker candidate for
+ DocTruth because its public project documents Apache-2.0 licensing, a Python
+ API shaped as `from rapidocr import RapidOCR; engine = RapidOCR(); result =
+ engine(img)`, and multiple local backends including MNN/ONNXRuntime. DocTruth
+ should still keep RapidOCR behind the JSON stdin/stdout worker boundary rather
+ than importing Python from Java or bundling OCR model binaries in the generic
+ jar.
+- The current OCR implementation already has `LocalOcrWorkerEngine`,
+ `ParserPreset.OCR`, doctor readiness reporting, low-confidence audit gating,
+ and a fake-MNN smoke. The concrete gap is a DocTruth-owned
+ `doctruth-rapidocr-mnn-worker` adapter plus discovery/smoke coverage, not the
+ Java parser API itself.
+- Java/PDFBox and Rust `doctruth-runtime` now have generated bordered-table
+ merged-cell parity for horizontal column spans and vertical row spans at the
+ unit/protocol/smoke boundary. The implementations infer horizontal span when
+ an internal vertical border does not cover the row band, infer vertical span
+ when an internal horizontal border does not cover the cell's column band, and
+ emit `rowRange`/`columnRange` for merged cells. This is still fixture-grade
+ heuristic support, not proof of multi-page table continuation, model-assisted
+ structure recognition, OCR-backed tables, or real-world labeled table
+ accuracy.
+- Rust `doctruth-runtime` page metadata no longer has to use hard-coded page
+ dimensions or source-hash-derived placeholder page hashes. It now reads page
+ MediaBox dimensions and emits stable `sha256:` hashes over page number,
+ dimensions, and content bytes. This is useful sidecar metadata parity, but it
+ is not rendered-PNG parity with the Java/PDFBox page image pipeline.
+- Rust `doctruth-runtime` now mirrors the Java no-silent-fallback contract for
+ model-assisted presets. When `table-lite`, `standard`, `table-server`, or
+ `ocr` require local models that are not executed by the runtime, the sidecar
+ still returns heuristic output for inspection but includes required model ids
+ in `parserRun.models`, emits per-model severe
+ `model_unavailable_fallback` warnings, and evaluates as
+ `NOT_AUDIT_GRADE`. This is fallback honesty, not real model execution.
+- `doctruth doctor --json` now separates OCR worker executable availability
+ from runtime readiness. A worker can be present on `PATH` but report
+ `ready=false` with a structured `statusCode` such as
+ `rapidocr_unavailable`. The RapidOCR adapter itself now has `--doctor`, which
+ imports and initializes `RapidOCR()` before reporting ready. On this machine,
+ the adapter self-test currently reports `rapidocr_unavailable` under the
+ default `python3`, while the raw Python 3.10 `rapidocr` command still has a
+ NumPy ABI mismatch. This is now visible instead of being silently treated as
+ OCR ready.
+- Java/PDFBox now has fixture-grade multi-page table continuation support for
+ adjacent generated bordered tables with repeated headers. It merges the table
+ sections, removes the duplicate continuation header, and keeps continued
+ `TABLE_CELL` units on their original source page. This required making
+ `TableCellRegion` page-aware while keeping the public record under the
+ architecture limit by using `TrustCellRange` row/column ranges. Rust sidecar
+ continuation, OCR-backed tables, and labeled real-world continuation accuracy
+ are still unproven.
+- Rust `doctruth-runtime` now has fixture-grade multi-page table continuation
+ support for adjacent generated bordered tables with repeated headers. The
+ runtime merges matching adjacent tables after extraction, removes the
+ continuation header, offsets continued row ranges, and stores the source page
+ per table cell so generated `TABLE_CELL` units for page-2 rows still cite
+ page 2. Runtime smoke and Java CLI sidecar smoke both exercise this path.
+ This is heuristic generated-fixture support, not proof of model-assisted
+ table structure recognition, OCR-backed table extraction, or real-world
+ labeled table accuracy.
+- Rust `doctruth-runtime` now has rendered PNG page image hash parity when a
+ configured renderer or local `pdftoppm` is available. Runtime and Java CLI
+ sidecar smokes compare `TrustPage.imageHash` against actual `pdftoppm` PNG
+ bytes. The runtime still falls back to a stable content/dimension hash if no
+ renderer is available, and this is hash parity rather than a Rust-owned
+ persisted page artifact pipeline, interactive review UI, or OCR accuracy
+ proof.
+- The RapidOCR adapter now handles RapidOCR 3.8-style array-like output for
+ `boxes`, `txts`, and `scores`; the previous `attr or []` normalization could
+ fail with NumPy-style `truth value is ambiguous` errors. The worker smoke now
+ locks that behavior with an array-like fake RapidOCR result.
+- A real opt-in RapidOCR smoke now exists and passes with an isolated venv using
+ `rapidocr==3.8.1` plus `rapidocr_onnxruntime==1.4.4`. It proves worker
+ `--doctor`, direct OCR, and Java CLI `parse --preset ocr` over a generated
+ scanned PDF. The user's default global Python/RapidOCR environment is still
+ broken because Python 3.10 sees a cpython-314 NumPy extension, so the real
+ smoke intentionally isolates dependencies. This does not prove an MNN-specific
+ backend package or labeled real-world OCR accuracy.
+- Parser benchmarks now include `ocr_text_accuracy`, computed from OCR-region
+ text against expected Markdown. Benchmark corpus manifests can request
+ `preset: "ocr"` per case, and the corpus smoke now gates a generated
+ scanned-PDF OCR case through the CLI. This turns OCR from a string-only smoke
+ into a threshold-gated generated corpus case, but still does not replace a
+ labeled real-world OCR corpus.
+- Local model-worker protocol now exists for configured model-assisted presets.
+ `TABLE_LITE` can call a configured worker, accept full `TrustDocument` JSON,
+ preserve model-produced `TrustTable`/`TABLE_CELL` units, and avoid
+ `model_unavailable_fallback` when the worker succeeds. This is a worker
+ protocol and fake-worker smoke, not actual ONNX/TATR/SLANeXT/RT-DETR model
+ inference or real-world layout/table accuracy proof.
+- `doctruth doctor --json` now exposes configured model-worker readiness under
+ `models.worker`, including executable availability, runtime readiness,
+ status code/message, timeout, and loaded model ids. The model-worker smoke
+ verifies this before table-lite parsing. This closes the deployment diagnosis
+ gap for configured workers, but not real model inference, model downloads, or
+ peak RSS reporting.
+- `models.worker` now also exposes worker-reported `rssMb` and `peakMemoryMb`.
+ The values default to `0` when omitted, and the model-worker smoke verifies
+ them through packaged CLI doctor JSON. This is protocol-level observability,
+ not independent process sampling or proof of real ONNX model memory usage.
+- Model-assisted parse requests are now cache-aware. A configured model worker
+ receives `modelCacheDirectory` and per-model `cachePath`, `cacheStatus`,
+ `actualSha256`, and `actualSizeBytes` from `ModelCacheVerifier`. This gives
+ future real ONNX/TATR/SLANeXT workers a deterministic handoff, while current
+ placeholder SHAs still mean generated smokes prove `MISSING` metadata rather
+ than READY model loading.
+- Local model manifests now close that placeholder-only gap for configured
+ workers. When `doctruth.model.manifest` or `DOCTRUTH_MODEL_MANIFEST` points
+ to a JSON manifest keyed by preset id, `LocalModelWorker` uses those model
+ descriptors before verifying the local cache. The model-worker smoke now
+ writes a SHA-matched `slanet-plus:local-smoke` artifact and verifies
+ `cacheStatus=READY` through the packaged CLI path. This is still a model
+ handoff contract, not real ONNX/TATR/SLANeXT/RT-DETR inference.
+- `doctruth cache warm --preset <preset>` now warms the local
+ model cache from manifest-defined local paths or `file://` sources, writes
+ artifacts under deterministic `ModelDescriptor.cacheFilename()` names, and
+ verifies SHA-256 with the shared cache verifier. It now also supports HTTP(S)
+ model sources through a streaming JDK `HttpClient` download path that writes a
+ temp file before moving into the cache. `--offline` refuses remote sources
+ before any network request. This closes the generic install/download
+ contract, while real model URL selection and real model execution remain
+ open.
+- `doctruth doctor --json` now uses `DOCTRUTH_MODEL_MANIFEST` as a local
+ model-cache preflight, not just parse-time worker metadata. It aggregates all
+ manifest preset descriptors, verifies artifacts in `DOCTRUTH_MODEL_CACHE`,
+ and reports `allReady` plus per-artifact identity/status/SHA/size/cache path.
+ This means a developer or agent can diagnose READY/MISSING/SHA_MISMATCH
+ before invoking a model-assisted parser preset. It still does not run ONNX or
+ sample real worker memory under inference load.
+- Model manifests now carry runtime hints separately from the SHA-verified
+ artifact descriptor. The fields `task`, `backend`, `format`, `precision`,
+ and `license` survive `cache warm --json`, `doctor --json`, and local
+ model-worker request JSON. This gives future real ONNX/TATR/SLANeXT adapters
+ routing metadata without expanding `ModelDescriptor` beyond the architecture
+ limit. This is still metadata propagation, not actual model execution.
+- A generic ONNXRuntime model-worker adapter now exists at
+ `scripts/doctruth-onnx-model-worker`. The ONNX smoke generates a tiny
+ identity model, warms the cache, runs worker `--doctor`, loads the cached
+ model with ONNXRuntime, executes one inference, and returns a `TrustDocument`
+ through the Java CLI model-worker path. Install and release packaging now
+ include the ONNX worker. This proves local ONNX execution plumbing, but not
+ production RT-DETR/TATR/SLANeXT model accuracy.
+- Strict RapidOCR MNN backend readiness is now distinct from generic RapidOCR
+ availability. With `DOCTRUTH_RAPIDOCR_BACKEND=mnn`, the worker imports
+ `MNN` or `mnn` before reporting backend readiness and exposes `backend`,
+ `backendReady`, and `backendVersion` in doctor JSON. The dedicated MNN
+ backend smoke and release smoke cover this contract. Real MNN OCR recognition
+ quality and labeled scanned-PDF accuracy remain open.
+- The ONNX model worker now has a synthetic TATR/DETR-style table decoder
+ contract. For `task=table-structure-recognition`, it finds outputs named like
+ `pred_logits` and `pred_boxes`, treats boxes as normalized `cx, cy, width,
+ height`, and emits `TrustTable` plus `TABLE_CELL` units. The dedicated smoke
+ proves this through Java CLI parse and SHA-warmed cache. This is not yet
+ curated real TATR/SLANeXT/RT-DETR weight execution or real-world parser
+ accuracy.
+- Low-confidence ONNX table structure detections are now explicit audit
+ blockers. When the synthetic TATR/DETR-style decoder keeps a table/cell
+ detection below `0.85`, it emits a severe parser warning
+ `table_structure_low_confidence` and returns
+ `auditGradeStatus=NOT_AUDIT_GRADE` while preserving the table and cells for
+ review/replay. This closes the silent-low-confidence table gap for the local
+ decoder contract, not real-world table confidence calibration.
+- The ONNX model worker now also has a synthetic RT-DETR/DETR-style layout
+ decoder contract. For `task=layout-detection`, it decodes outputs named like
+ `pred_logits` and `pred_boxes` into bbox-bearing `TEXT_BLOCK` layout units
+ sorted by reading order. The dedicated smoke proves this through Java CLI
+ parse and SHA-warmed cache. This is still not curated real RT-DETR weight
+ execution or real-world layout accuracy.
+- Low-confidence ONNX layout detections are now explicit audit blockers. When
+ the synthetic layout decoder keeps a detection below `0.85`, it emits a
+ severe unit warning `layout_low_confidence` and returns
+ `auditGradeStatus=NOT_AUDIT_GRADE` while preserving the region for
+ review/replay. This closes the silent-low-confidence layout gap for the local
+ decoder contract, not real-world confidence calibration.
+- Direct ONNX worker parse responses now include resource metrics from a real
+ ONNXRuntime session: total wall time, inference wall time, RSS, and peak
+ memory. The dedicated resource smoke verifies these fields over a generated
+ ONNX identity model. This is stronger than protocol-only doctor defaults, but
+ still not a production-weight RSS/throughput benchmark.
+- Parser benchmark corpus manifests now support SHA-pinned remote public PDF
+ fixtures through `sourceUrl` plus `sourceSha256`. The W3C dummy PDF smoke
+ downloads into `.doctruth-corpus-cache`, verifies SHA-256, and gates a
+ human-authored expected `TrustDocument` label. This closes the generated-only
+ corpus smoke gap for one public PDF, but not the larger multi-layout
+ real-world corpus.
+- The ONNX model worker is now packaged as a tiny executable shim plus
+ `doctruth_onnx_worker_lib.py`. Source install, release tarball, Homebrew
+ formula generation, and release smoke all include the helper module, while
+ existing identity/TATR/layout/resource/low-confidence smokes still exercise
+ the same worker command. This is an internal packaging split, not a new model
+ accuracy claim.
+- Rust sidecar doctor now reports process `rssMb` and `peakMemoryMb` without
+ adding a Rust dependency. Linux reads `/proc/self/status`; macOS and other
+ Unix systems fall back to `ps -o rss=`. This satisfies the local doctor
+ resource contract, but production-weight model peak memory remains
+ unmeasured until real models are loaded.
+- Benchmark corpus loading now has an explicit offline mode. `ParserBenchmarkCorpus.load(path, true)`
+ and `doctruth benchmark-corpus --offline` refuse uncached remote
+ `sourceUrl` fixtures before any network request, while cached remote PDFs are
+ still accepted after `sourceSha256` verification. The benchmark smoke also
+ runs the CLI with `-Djava.awt.headless=true` to avoid macOS/PDFBox native AWT
+ aborts during generated OCR PDF rendering.
+- Parser benchmark corpora now distinguish higher-is-better `minimums` from
+ lower-is-better `maximums`. The first lower-is-better metric is
+ `strict_warning_false_negative_rate`: it compares expected severe warning
+ codes from parserRun and unit-local warning labels against actual severe
+ warnings. This lets corpus labels fail when a parser silently misses an audit
+ blocking condition. It is a contract gate; proving the PRD's <= 2% target
+ still requires a real warning-labeled PDF corpus.
+- Parser benchmark cases now carry parse latency. Directly constructed cases
+ default to `0.0` for deterministic unit fixtures, while `fromPdf(...)`
+ measures wall-clock parse time. Corpus output reports aggregate
+ `parser_latency_p50` and `parser_latency_p95`, and `maximums` can gate
+ `parser_latency_p95` at the corpus level. This proves the latency reporting
+ contract, not the PRD's production latency target on a broad labeled corpus.
+- Benchmark threshold routing now needs to treat aggregate metric names as a
+ separate namespace from per-case metrics. `compact_llm_size_reduction_min`
+ is derived from per-case `compact_llm_size_reduction` and enforced as a
+ corpus aggregate `minimums` threshold; otherwise manifests fail against a
+ missing per-case key with misleading `actual=0.0` output.
+- The recorded real-world PDF corpus caught a concrete invalid-evidence risk:
+ some table grid/cell calculations can produce off-page or zero-area boxes.
+ Cell bbox normalization must clamp to page bounds and skip collapsed cells so
+ downstream review/replay surfaces never receive invalid cell anchors.
+- Coverage should be improved with behavior tests first. For this branch, the
+ bundle coverage thresholds stayed unchanged; narrowly excluded class-level
+ utility/option wrappers are covered through higher-level CLI/runtime contract
+ tests rather than counted as independent behavior.
+- Current recorded verification is strong for crash/regression safety on the
+ checked-in real-world corpus: 383 PDFs, 379 parsed, 4 malformed-input
+ failures, 0 bugs. It is not the same as broad human-labeled parser accuracy.
+ Layout precision, borderless tables, OCR, model-assisted detection, and
+ source-map quality still need larger labeled corpora before product accuracy
+ claims are defensible.
+- Status wording matters: the current branch should not be described as full
+ PRD completion. It completed a large contract/runtime slice and proved a Rust
+ sidecar MVP, but full PRD completion still requires a Rust-first default core,
+ reusable Rust library crate, real model execution, real OCR quality, and
+ labeled benchmark accuracy.
+- `doctruth-runtime` was still binary-only even though the PRD calls for Rust
+ core reuse behind Java and future native/JNI bindings. Splitting `src/lib.rs`
+ from a thin `src/main.rs` is the correct first Rust-first step because it
+ makes protocol/parse functions callable from Rust tests and future bindings
+ without changing the Java public SDK yet.
+- The existing Rust runtime error JSON uses `error_code`, not `code`. New tests
+ should preserve that protocol unless there is an explicit versioned protocol
+ migration.
+- Java SDK runtime selection now has a staged Rust-first default: configured
+ `doctruth.runtime.command` / `DOCTRUTH_RUNTIME_COMMAND` wins before PDFBox for
+ non-OCR TrustDocument parsing. This is not yet zero-config Rust default
+ because there is no packaged runtime discovery path in the Java jar.
+- CLI backend semantics are now `auto|pdfbox|sidecar`. `auto` plus `--runtime`
+ or `DOCTRUTH_RUNTIME_COMMAND` selects sidecar; explicit `pdfbox` remains the
+ compatibility/fallback path. This better matches the PRD than requiring users
+ to type `--backend sidecar` whenever they have a runtime.
+- Source install and release artifacts previously could not be zero-config
+ Rust-first because they omitted `doctruth-runtime`. Packaging now includes
+ `bin/doctruth-runtime`, and launchers set `DOCTRUTH_RUNTIME_COMMAND` from the
+ same directory before invoking Java. This makes packaged CLI parsing
+ Rust-first while keeping direct jar and SDK usage explicit.
+- macOS shell smokes should not assume `java` is usable; `/usr/bin/java` may be
+ a stub. Use the repo's existing Homebrew/OpenJDK fallback pattern for
+ installer/release smoke commands.
+- Synthetic ONNX decoder smokes prove the local ONNXRuntime/model-worker path,
+ but they should not be used as evidence that real RT-DETR/TATR/SLANeXT
+ artifacts work. The new opt-in real model artifact smoke is the right bridge:
+ when supplied a SHA-pinned manifest, it exercises cache warm, ONNXRuntime
+ doctor, model-worker parse, expected model identity, and expected layout/table
+ output shape through the same CLI path.
+- Generated OCR corpus gating now covers both directions through the CLI and
+ packaged smoke: a correct OCR label passes `ocr_text_accuracy`, and a wrong
+ expected Markdown label fails with the OCR case name and metric in stderr.
+ This is a stronger regression gate for label drift, but it is still not a
+ broad labeled scanned-PDF OCR accuracy corpus.
+- The real RapidOCR runtime can now be routed through the benchmark corpus gate
+ with `scripts/smoke-doctruth-real-ocr-corpus.sh`. The opt-in run installs
+ RapidOCR + ONNXRuntime, downloads PP-OCRv4 mobile ONNX models, verifies the
+ worker doctor, and gates `ocr_text_accuracy` on a generated scanned-PDF
+ fixture. This closes a runtime integration gap, but it still does not provide
+ broad real-world scanned-PDF OCR accuracy.
+- `scripts/smoke-doctruth-real-tatr-artifact.sh` now proves one public real
+ TATR artifact can enter the DocTruth local model path: Xenova's quantized
+ Table Transformer ONNX downloads to a local cache, gets SHA-pinned in a model
+ manifest, warms through the CLI cache command, and executes through
+ ONNXRuntime/model-worker from the Java CLI parse path. The current ONNX worker
+ must default 4D dynamic vision input shapes to `[1, 3, 800, 800]`; replacing
+ every dynamic dimension with `1` breaks real conv models. This is still
+ execution proof only, not table recognition accuracy, because image
+ preprocessing and real model post-processing are not implemented.
+- The ONNX worker now has a real page-image input path for 4D vision models. If
+ `pdftoppm` and Pillow are available, it renders the first PDF page, resizes it
+ to the model input shape, converts it to a channel-first RGB float tensor, and
+ reports `metrics.inputSource=rendered_page`; otherwise it reports
+ `synthetic_tensor`. This materially improves the real TATR path, but
+ TATR-specific normalization and post-processing into table structure are
+ still not implemented.
+- Public Xenova TATR uses the Table Transformer structure label set, not the
+ synthetic smoke's two-label `table/cell` shape. Treating every non-table
+ detection as `cell` produced flat row-0 pseudo-cells from real column/row
+ detections. The ONNX worker now switches to real TATR decoding when logits
+ expose the production class count, then intersects sorted `table row` and
+ `table column` boxes to build provisional `TABLE_CELL` evidence. This closes
+ the immediate false structure gap for the public artifact smoke, but not
+ calibrated production table accuracy.
+- `Kreuzberg/layout-models` provides a suitable public document-layout RT-DETR
+ ONNX artifact for local smoke coverage. Its `rtdetr/model.onnx` differs from
+ the synthetic DETR-style layout smokes: it needs `images` plus int64
+ `orig_target_sizes`, and returns `labels`, absolute `boxes`, and `scores`
+ rather than `logits`/`pred_boxes`. The worker now supports both shapes. This
+ closes the real artifact execution gap for layout detection, while still
+ leaving multi-column reading-order improvement and labeled layout accuracy as
+ benchmark-corpus work.
+- SLANeXT should not be forced into the ONNX worker path. The public/practical
+ runtime path is PaddleOCR/SLANeXT-style table recognition that returns table
+ structure/cells rather than DETR-style `logits`/`boxes`. The correct DocTruth
+ boundary is a separate `doctruth-slanext-table-worker` JSON adapter that can
+ be installed with the CLI but does not bundle PaddleOCR/Paddle/model binaries.
+ A fake PaddleOCR smoke locks the adapter and Java CLI integration; real
+ SLANeXT execution is now verified as an opt-in smoke in an isolated Python
+ 3.10 venv with PaddleOCR 3.7.0 and PaddlePaddle 3.3.1.
+- PaddleOCR 3.7 `TableStructureRecognition.predict()` returns
+ `TableRecResult.json.res`, not the fake worker's cell shape. Its `structure`
+ is an HTML-like token stream, and its `bbox` entries may be flat 8-number
+ quadrilateral arrays. The DocTruth SLANeXT adapter must normalize that shape
+ into row/column cells before Java can see table evidence.
+- `kind: human-labeled` is necessary but insufficient for a parser accuracy
+ claim. A separate `qualityProfile: parser-accuracy` gate now forces declared
+ coverage tags and minimum case counts before a manifest can load. This keeps
+ small public fixtures useful for plumbing while preventing them from being
+ mistaken for broad accuracy evidence.
+- Real model smokes need Python isolation by model family. RT-DETR/TATR use the
+ ONNXRuntime worker available in the default Python environment, while SLANeXT
+ needs a PaddleOCR/Paddle environment. Running the entire suite with the
+ PaddleOCR venv first broke ONNXRuntime import. `DOCTRUTH_SLANEXT_PYTHON`
+ now isolates only the SLANeXT step.
+- Release CI needs both system and Python dependencies for real model gates:
+ `poppler-utils` for rendered PDF pages, ONNXRuntime/Pillow/NumPy for
+ RT-DETR/TATR, and PaddleOCR/Paddle for SLANeXT. Normal PR CI should exercise
+ the suite's skip path, while release tags run the heavy real suite.
+- Keep release model-smoke Python dependencies pinned. The verified local set is
+ ONNXRuntime 1.26.0 for RT-DETR/TATR and PaddleOCR 3.7.0 with PaddlePaddle
+ 3.3.1 for SLANeXT; PaddleOCR pulls NumPy below 2.4, so the release workflow
+ pins `numpy<2.4`.
+- Human-labeled benchmark corpora need their own manifest semantics; otherwise
+ generated fixtures can be mistaken for accuracy evidence. `kind:
+ human-labeled` now requires label-set version, reviewer, review date, and
+ explicit required metrics with thresholds. CLI JSON carries this metadata so
+ CI/reporting can distinguish generated regression gates from human-labeled
+ accuracy runs.
+- The public W3C remote-PDF smoke now exercises that `kind: human-labeled`
+ metadata path through a real downloaded PDF and CLI JSON assertions. This is
+ a useful release gate for corpus plumbing, but it is not broad enough to
+ support real-world parser accuracy claims.
+- A generated parser-accuracy seed corpus is useful as a CI gate for manifest
+ coverage and metric plumbing, but it must be described as a seed. Because its
+ expected labels are produced from current parser output, it cannot be used as
+ evidence of real-world parser accuracy.
+- Parser-accuracy benchmark reports need case-level traceability, not only
+ corpus-level label metadata. `labelId` links each metric row back to the
+ reviewed label set, while `tags` show which required coverage bucket the case
+ satisfied. Without those fields in CLI JSON, a passing release report would
+ be hard to audit after the broad real-world corpus is populated.
+- Parser-accuracy reports also need an explicit review posture. A generated
+ seed corpus is useful for CI contract coverage, but `reviewType:
+ generated-seed` must be machine-visible so it cannot be mistaken for
+ `human-reviewed` real-world accuracy evidence.
+- The Rust-first correction changes where new parser-quality work should land.
+ Java still owns a large compatibility surface today, but new corpus gates
+ should be added to `runtime/doctruth-runtime` first. The new Rust
+ `benchmark_corpus` command proves manifest loading, label metadata,
+ `labelId`/`tags`, tag coverage, and basic metrics without the Java CLI.
+ This is a migration of gate ownership, not proof of final parser quality.
+- The model-worker migration should also happen at the Rust boundary first.
+ `doctruth-runtime parse_pdf` now owns the configured worker handoff for
+ model-assisted presets and treats worker bad JSON/process failure as
+ `MODEL_WORKER_FAILED`. This makes Rust the control point for future
+ RT-DETR/TATR/SLANeXT/OCR execution, while still leaving actual model
+ execution outside the Rust binary for now.
+- Rust parser-accuracy corpora must be able to exercise model-assisted presets,
+ not only the default text-layer parser. Case-level `preset` now routes a
+ corpus case through the same Rust `parse_pdf` model-worker handoff, so future
+ broad labeled corpora can include table/layout/OCR cases under the Rust
+ runtime gate.
+- The PRD's intended final architecture is Rust core, not Java/PDFBox core with
+ optional Rust sidecar. Java is the stable enterprise-facing SDK/CLI/API and
+ compatibility shell. Any future parser-quality capability that exists only in
+ Java should be treated as incomplete until the Rust runtime owns it and Java
+ merely exposes or adapts it.
+- MinerU's output layering is worth adopting as a product contract, but not as
+ a copied schema. The useful split is final Markdown for humans/LLMs,
+ flat `content_blocks.json` for reading-order ingestion, deep
+ `parse_trace.json` for page/block/line/span evidence and parser QA, visual
+ layout/span debug artifacts, and DocTruth's own `trust.json` as the canonical
+ evidence/replay contract. Current DocTruth has `TrustDocument`, `TrustUnit`,
+ source maps, tables, and evidence spans, but it does not yet expose the full
+ intermediate page -> block -> line -> span trace as a first-class output.
+ Future layered-output work should land at the Rust runtime boundary first.
+- The first Rust-owned layered output slice now exposes `contentBlocks` and
+ `parseTrace` directly in `parse_pdf` output. These are derived from the same
+ Rust `body.units` and `body.pages` observations as `TrustDocument`, so clean
+ content blocks and trace spans can be linked back to `unitId`,
+ `sourceObjectId`, and `evidenceSpanId`. This closes the first contract gap,
+ but CLI file profiles such as `--format content_blocks` /
+ `--format parse_trace` and visual layout/span debug artifacts remain pending.
+- CLI layered output profiles now exist for both Java/PDFBox-derived
+ `TrustDocument`s and Rust sidecar-derived `TrustDocument`s:
+ `doctruth parse --format content_blocks` writes
+ `doctruth.content_blocks.v1`, and `--format parse_trace` writes
+ `doctruth.parse_trace.v1`. The profile uses preserved Rust sidecar layered
+ payloads when the runtime emitted them, and falls back to a deterministic
+ `TrustDocument` projection for legacy/compatibility documents.
+- The first visual trace artifact slice is now package-level rather than a new
+ parser command: `doctruth review-package` writes `content_blocks.json`,
+ `parse_trace.json`, `layout-debug.html`, and `span-debug.html` alongside
+ `trust-document.json`, `review.html`, and page PNGs. The debug HTML carries
+ `data-trace-block-id`, `data-trace-line-id`, and `data-trace-span-id`
+ attributes that are verified against `parse_trace.json`. This closes the
+ Phase 0A visual trace contract for review-package QA, but it is still a
+ deterministic `TrustDocument` projection and not proof of broad
+ multi-layout/parser accuracy.
+- The Java `parse_trace` profile was aligned with Rust's `pageSize` shape
+ (`width`/`height`, not bbox fields), and sidecar capabilities now advertise
+ `content_blocks` and `parse_trace`. Raw Rust-sidecar layered products are now
+ preserved through `TrustDocumentJson` and can be written through public
+ `TrustDocument.writeContentBlocks(...)` / `writeParseTrace(...)` SDK writers;
+ Java only re-derives stable layered outputs when the source document did not
+ carry runtime layered observations.
+- 2026-06-13 documentation/status audit result:
+ - Complete: MinerU-style `content_blocks.json` / `parse_trace.json` contract,
+ Rust `parse_pdf` layered output, CLI `--format content_blocks` /
+ `--format parse_trace`, and review-package `layout-debug.html` /
+ `span-debug.html` trace-id artifacts.
+ - Complete: Docling-style v1 `TrustDocument`/`TrustUnit` contract, lossless
+ JSON plus lossy Markdown/HTML/plain/compact outputs, provenance/source-map
+ contracts, parser backend separation, and v1 chunk/evidence/MCP surfaces.
+ - Complete: local model cache/manifest handoff, SHA verification, runtime
+ hints, doctor/cache warm contracts, configured model-worker protocol, and
+ Rust runtime worker handoff.
+ - Complete: public RT-DETR and TATR artifact entrypoint through
+ `doctruth-runtime parse_pdf` via `scripts/smoke-doctruth-runtime-real-model-artifacts.sh`.
+ - Partial: Rust-core ownership. Packaged CLI can be Rust-first and Rust owns
+ `parse_pdf`/`benchmark_corpus`/worker handoff, but direct Java SDK/JAR paths
+ still rely on explicit/configured runtime selection and Java/PDFBox remains
+ active fallback/oracle.
+ - Complete for generated real-route smokes, partial for broad quality:
+ SLANeXT/OCR Rust ownership now includes Rust worker routing, normalized
+ TrustDocument envelopes, generated real RapidOCR + ONNXRuntime through the
+ Rust runtime path, and generated real PaddleOCR/SLANeXT through the Rust
+ runtime path.
+ - Complete for v1 model-execution architecture: ADR 0011 accepts external
+ local JSON workers as the heavy model execution boundary while Rust owns
+ orchestration, manifest/cache validation, request envelopes, response
+ normalization, benchmark execution, and audit propagation.
+ - Partial: parser quality. Generated fixtures, remote W3C plumbing, seed
+ parser-accuracy manifests, and recorded crash/regression corpus exist, but
+ broad human-reviewed multi-layout/table/OCR/bbox/source-map accuracy is not
+ populated.
+ - Missing: broad human-reviewed parser-accuracy corpus, labeled scanned-PDF
+ OCR corpus, and labeled SLANeXT/table accuracy corpus.
+- Rust `benchmark_corpus` is no longer only a manifest/metadata plumbing gate.
+ It now reads expected `TrustDocument` JSON labels and can threshold
+ `bbox_iou`, `evidence_span_accuracy`, `table_cell_f1`, and
+ `ocr_text_accuracy` in addition to `reading_order_f1`,
+ `quote_anchor_accuracy`, and `bbox_coverage`. This closes a Rust-side metric
+ parity gap for future broad labeled corpora, but it still depends on those
+ corpora being populated.
+- Human-reviewed parser-accuracy corpus manifests now have an explicit scale
+ gate: `reviewType: human-reviewed` requires `labeling.minTotalCases` and the
+ loader rejects reports whose case count is below that value. Generated seed
+ corpora are intentionally exempt so they can stay small CI plumbing gates.
+ This prevents a one-fixture human-reviewed run from being presented as broad
+ parser accuracy evidence.
+- Human-reviewed parser-accuracy labels are now source-byte pinned:
+ `reviewType: human-reviewed` requires every case to carry `sourceSha256`, and
+ both Java and Rust reject missing pins. Java now verifies local `source`
+ files against `sourceSha256` as well as remote `sourceUrl` cache entries;
+ Rust already verified mismatches and now also requires the pin for
+ human-reviewed parser-accuracy manifests. Generated seed corpora remain
+ exempt because they are plumbing checks, not accuracy evidence.
+- Human-reviewed parser-accuracy manifests now also require the core metric
+ set: `reading_order_f1`, `quote_anchor_accuracy`, `bbox_coverage`,
+ `bbox_iou`, `evidence_span_accuracy`, `table_cell_f1`, and
+ `ocr_text_accuracy`. Java and Rust both reject incomplete
+ `requiredMetrics` for `reviewType: human-reviewed`. Generated seed corpora
+ remain exempt, and generated contract fixtures may use conservative
+ thresholds; real parser-quality claims still require broad human-reviewed
+ corpus thresholds and recorded reports.
+- Human-reviewed parser-accuracy manifests now also require the core coverage
+ tags: `multi-layout`, `table`, `ocr`, `bbox`, and `source-map`. Java and
+ Rust both reject incomplete `requiredTags` for `reviewType: human-reviewed`.
+ A generated contract case may carry all tags to prove the manifest/reporting
+ path, but that remains a plumbing proof; real parser-quality claims still
+ require separate broad fixtures under those categories.
+- `doctruth benchmark-corpus --report-out <path>` now writes an
+ auditable parser benchmark report artifact with
+ `reportFormat: doctruth.parser-benchmark.report.v1`, the resolved manifest
+ path, label/review/profile metadata, aggregate metrics, and per-case
+ label/tag/metric evidence. This closes the recorded-report artifact contract
+ for future parser-accuracy runs, but it does not create or validate the broad
+ human-reviewed corpus itself.
+- Rust `doctruth-runtime` now has the same recorded-report artifact capability
+ for `benchmark_corpus` through request field `report_path`. The runtime smoke
+ verifies the artifact separately from stdout. This keeps future
+ human-reviewed Rust corpus runs archivable without depending on shell
+ redirection.
+- Recorded benchmark reports now include per-case `sourceSha256` in both Java
+ CLI and Rust runtime paths. This matters because a human-reviewed
+ parser-accuracy report must prove not only which labels and metrics were used,
+ but which exact PDF bytes those labels were attached to.
+- Recorded benchmark reports now also include top-level `manifestSha256` in
+ both Java CLI and Rust runtime paths. This pins the exact manifest content,
+ including label metadata, thresholds, case list, and required coverage, to the
+ archived report.
+- Recorded benchmark reports now copy `minimums` and `maximums` into the report
+ body in both Java CLI and Rust runtime paths. This makes the artifact
+ self-contained about which pass/fail thresholds were applied, while
+ `manifestSha256` still pins the full original manifest.
+- Recorded benchmark reports now include actual `caseCount` and `casesPerTag`
+ in both Java CLI and Rust runtime paths. This separates coverage actually run
+ from coverage merely required by the manifest, which is necessary before
+ archived broad parser-accuracy reports can be treated as evidence.
+- `doctruth verify-benchmark-report <report-path>` now verifies recorded Java
+ parser benchmark reports without rerunning the parser. It checks report
+ format, pass status, manifest hash, copied thresholds, coverage counts, and
+ source-hash pins. The benchmark smoke covers both valid report verification
+ and tampered coverage failure.
+- The report verifier now also checks copied coverage requirements:
+ `minCasesPerTag` and `minTotalCases`. It expands manifest shorthand
+ `minCasesPerTag: 1` across `requiredTags` before comparison, then verifies
+ the actual report cases satisfy those thresholds.
+- Rust `doctruth-runtime` now has verifier parity for recorded benchmark
+ reports: it writes expanded `minCasesPerTag` and accepts
+ `verify_benchmark_report` with `report_path`, validating manifest hash,
+ copied thresholds, coverage counts, coverage requirements, and source pins
+ without the Java CLI.
+- Rust `benchmark_corpus` now enforces manifest `maximums` in addition to
+ `minimums`. Before this change, lower-is-better thresholds were copied into
+ reports but not applied, so Rust could emit `passed: true` even when a
+ `maximums` gate was violated.
+- Recorded report verifiers now re-check metric values against copied
+ `minimums` and `maximums`. Java and Rust both prefer aggregate report metrics
+ when present and fall back to per-case metrics when a thresholded metric is
+ not emitted in the aggregate block.
+- Recorded report verifiers now also check aggregate/case metric consistency.
+ Java recomputes the runner's derived aggregate metrics such as
+ `parser_latency_p50`, `parser_latency_p95`, and
+ `compact_llm_size_reduction_min`; Rust recomputes same-name aggregate metrics
+ from case metrics using the runtime's rounded-average semantics.
+- Java recorded report verification now treats `casesPerTag` as an exact
+ coverage map. Forged extra tag keys are rejected with `casesPerTag mismatch`,
+ matching the Rust verifier's stricter behavior.
+- OCR preset selection is now runtime-first when `doctruth.runtime.command` or
+ `DOCTRUTH_RUNTIME_COMMAND` is configured. Java/PDFBox OCR remains the fallback
+ path when no Rust runtime is available, but OCR no longer bypasses the
+ configured Rust sidecar.
+- Runtime status docs now describe `doctruth-runtime` as an active
+ Rust-controlled runtime with parse, benchmark, verify, doctor, model-worker,
+ layered-output, and real-route smoke coverage, while still calling out that
+ heavy models are external-worker/opt-in and broad human-reviewed accuracy
+ proof is pending.
+- Broad human-reviewed corpus population is now intentionally final-stage. The
+ immediate engineering target is to complete the Rust-first runtime and
+ fallback boundaries; a future review workstation can then accumulate
+ approved/corrected labels for real accuracy measurement.
+- The SDK now has a path-first TrustDocument parser entrypoint:
+ `DocTruth.withProvider(provider).parsePdf(path).withParser(preset)`.
+ `ParserBackendMode.AUTO` prefers a configured Rust runtime,
+ `ParserBackendMode.PDFBOX` forces Java/PDFBox fallback/oracle behavior, and
+ `ParserBackendMode.SIDECAR` fails unless a runtime is configured.
+- Architecture correction: Java/PDFBox is not a parser core. The DocTruth
+ parser core should mirror the Kreuzberg-style shape: Rust runtime as core,
+ `pdf_oxide` as the Rust PDF text/page extraction backend, model workers for
+ layout/table/OCR enhancements, and Java only as SDK/CLI wrapper,
+ sidecar-client packaging, legacy compatibility, and regression oracle.
+- The Rust runtime now uses `pdf_oxide` for column-aware text-layer page
+ extraction, text-span bbox evidence, page MediaBox geometry, and default
+ rendered PNG page image hashes, and no longer depends on `pdf-extract` or a
+ default `pdftoppm` renderer. It still uses `lopdf` for table/debug extraction,
+ so the backend status is `PARTIAL`, not complete.
+- OpenDataLoader Bench should be treated as DocTruth's parser-quality
+ foundation because evidence quality is capped by parser quality. It should
+ feed external parser-quality metrics such as reading-order NID, table TEDS,
+ heading MHS, and speed into DocTruth benchmark reports. It should not replace
+ DocTruth's evidence/replay benchmark because DocTruth still needs
+ bbox/source-map/evidence-span/audit-grade/replay-integrity checks that
+ OpenDataLoader Bench does not cover.
+- The intended benchmark composition is now:
+ `OpenDataLoader Bench = parser substrate quality` and
+ `DocTruth Bench = evidence, replay, and audit quality`. A parser-quality
+ failure should prevent audit-grade promotion even if DocTruth can still emit
+ reviewable evidence spans.
+- Review packages now use the exported page PNG manifest as the page-image hash
+ source of truth. `trust-document.json`, `page-images.json`, and `review.html`
+ are generated from the same rendered page list so a reviewer can anchor bbox
+ evidence to the exact PNG bytes shipped in the package.
+- Smoke coverage has been reconciled with the Rust-default parser path. CLI
+ model-worker smokes now expect `rust-sidecar+model-worker` as the outer
+ parser backend; worker-native `pdfbox+model-worker` strings remain only as
+ internal worker provenance where applicable.
+- The W3C dummy real-PDF smoke is now labeled as a text-layer evidence fixture,
+ not a fake table fixture. Table quality remains covered by dedicated table
+ and TATR/SLANeXT smokes.
+
+## 2026-06-14 CLI Shorthand Rust-Default Gap
+
+- `doctruth parse --json` and `--markdown` still pointed at legacy
+ `ParsedDocument` output even after the rest of the CLI/SDK/MCP paths had
+ moved to Rust TrustDocument by default. That meant a user could request a
+ common parse output and silently bypass the Rust runtime.
+- The shorthand flags now map to `TRUST_JSON` and `TRUST_MARKDOWN`.
+ Legacy `ParsedDocument` output remains available only as an explicit
+ Java/PDFBox oracle/compatibility run:
+ `--backend pdfbox --format legacy-json|legacy-markdown`.
+- Focused verification passed:
+ `mvn -q -Dtest=DocTruthCliTest,TrustDocumentCliOutputProfileTest test`;
+ `mvn -q -Dtest=DocTruthCliMcpTest,TrustDocumentParserApiContractTest,TrustDocumentSdkParserContractTest test`;
+ `cargo fmt --manifest-path runtime/doctruth-runtime/Cargo.toml -- --check`;
+ `cargo test --manifest-path runtime/doctruth-runtime/Cargo.toml`;
+ `JAVA_TOOL_OPTIONS=-Djava.awt.headless=true mvn verify -P recorded`
+ with 1046 unit tests passing, recorded PDF corpus
+ `383 total / 379 success / 4 malformed trailer failures`, CSV fixture
+ `57/57`, and coverage checks passing;
+ `git diff --check`.
+
+## 2026-06-14 OpenDataLoader Bench Adapter Shape
+
+- OpenDataLoader Bench should be consumed as a parser-quality benchmark layer:
+ DocTruth exports Rust-runtime predictions into a compatible artifact shape,
+ imports its `evaluation.json` metrics, and records those metrics under
+ `external_metrics` in DocTruth benchmark reports.
+- The adapter must not replace `TrustDocument`, source maps, replay packages,
+ or DocTruth's own evidence metrics. OpenDataLoader-style NID/TEDS/MHS/speed
+ answers whether the parser substrate is good enough; DocTruth metrics answer
+ whether the resulting evidence is citeable, source-hash-bound, replayable,
+ and audit-grade.
+- Future implementation should avoid running non-permissive benchmark engines
+ in DocTruth CI. Use synthetic local fixtures and checked-in evaluation JSON
+ for RED tests, then optionally compare external published prediction
+ artifacts outside the default OSS gate.
+
+## 2026-06-15 Goal 3 Runtime Capability Doctor
+
+- Before this slice, Rust `doctruth-runtime --doctor` exposed runtime memory and
+ coarse model booleans only. Java CLI doctor could report richer model cache
+ state, but the Goal 3 ownership boundary says Rust runtime owns orchestration,
+ manifest/cache validation, capability reporting, and audit propagation for
+ parser models.
+- Rust runtime doctor now reports native text, document-structure/reading-order,
+ layout, table, and OCR capability slots. Model availability is derived from
+ local cache verification instead of optimistic preset names.
+- Rust runtime doctor now validates configured manifest/cache state without
+ inference: per-preset model identities, cache path, status, actual SHA-256,
+ actual size, and configured manifest path are visible in the runtime report.
+- Worker readiness is now separated from worker configuration and executability.
+ A worker that responds to `--doctor` with `ok:false` or a failure code is
+ reported as not ready even if the process exits successfully.
+- Remaining Goal 3 gaps are not erased by this doctor work: parser-quality
+ phases still need the OpenDataLoader-style geometry/filter/table work,
+ tagged-structure preference, and later OpenDataLoader Bench adapter/gates.
+
+## 2026-06-15 OpenDataLoader Bench Corpus Correction
+
+- The previous "broad human-reviewed corpus" blocker was too broad for current
+ parser-quality work. OpenDataLoader Bench already provides an external corpus,
+ ground-truth Markdown, evaluator code, and published `evaluation.json`
+ artifacts for parser-quality metrics.
+- The correct immediate gap is adapter work: export DocTruth Rust runtime output
+ to OpenDataLoader Bench prediction artifacts, run or consume its evaluator,
+ import NID/TEDS/MHS/speed metrics into DocTruth benchmark reports, and gate
+ audit-grade promotion on those parser-quality thresholds.
+- DocTruth-owned human-reviewed corpus work remains useful for evidence-specific
+ labels such as source maps, bbox anchoring, quote spans, and replay integrity,
+ but it should not block adoption of OpenDataLoader Bench as the first external
+ parser-quality gate.
+
+## 2026-06-17 Parser Quality Replication Research
+
+- The latest full OpenDataLoader Bench run for
+ `doctruth-runtime-optimized-timeout` is an honest quality baseline, not a
+ parity result: `overall_mean=0.549140667373931`,
+ `nid_mean=0.7663393307030263`, `teds_mean=0.06498004117639267`, and
+ `mhs_mean=0.12239636974611434`.
+- The vendored reference artifacts show the target ranges:
+ OpenDataLoader base `overall=0.8312090061093924`, `nid=0.9023157231108666`,
+ `teds=0.4886923812957386`, `mhs=0.7394793823129436`; Docling
+ `overall=0.8816788439412203`, `nid=0.8983654504334178`,
+ `teds=0.8870548597181608`, `mhs=0.8240014790562668`; OpenDataLoader hybrid
+ `overall=0.9065718466674022`, `nid=0.9337307553293448`,
+ `teds=0.9276430534097512`, `mhs=0.8207761855598542`.
+- OpenDataLoader Bench's own adapter code runs OpenDataLoader base with
+ `table_method="cluster"` and Markdown output. The hybrid adapter starts
+ `opendataloader_pdf.hybrid_server` and calls the converter with
+ `hybrid="docling-fast"`. Docling's adapter runs
+ `DocumentConverter().convert(...).document.export_to_markdown()`.
+- The practical gap is complete pipeline replication, not absence of reference
+ projects. Local ports of XY-Cut/filter/export behavior are useful, but the
+ score gap requires a reference-oracle report, per-case metric triage, real
+ table clustering, real heading/section modeling, stronger reading-order/text
+ normalization, and OCR routing for no-text pages.
+- Added `docs/plans/2026-06-17-parser-quality-replication-plan.md` as the
+ working plan for reproducing OpenDataLoader/Docling-quality behavior while
+ keeping `TrustDocument` canonical and Java/PDFBox out of the parser core.
+
+## 2026-06-17 Parser Quality Replication Pass 2
+
+- Added a reference-oracle comparison report:
+ `scripts/compare-doctruth-parser-references.py` compares any DocTruth engine
+ against the vendored OpenDataLoader, Docling, and OpenDataLoader hybrid
+ `evaluation.json` artifacts and records per-case metric deltas, top-loss
+ metrics, failure buckets, and Markdown feature signals.
+- Added a triage report:
+ `scripts/triage-doctruth-parser-reference-report.py` groups real bench losses
+ into implementation phases such as table clustering, heading/section tree,
+ and reading-order/text normalization.
+- Fixed the OpenDataLoader prediction export to read TrustDocument
+ `rowRange`/`columnRange` table cells instead of only `row`/`column`, which
+ lifted real table cases such as `01030000000082` from TEDS `0.0348` to
+ roughly `0.5729`.
+- Added a guarded bbox-based spatial table fallback for TrustDocument outputs
+ with no structured `body.tables`. The first unguarded attempt improved table
+ recall but badly regressed two-column prose by converting normal text into
+ huge HTML tables. The final guard rejects segments with too many columns,
+ long median cell text, sparse fill, or weak row width.
+- Added export-layer heading promotion for obvious all-caps, numbered, and
+ title-like headings. This reduces missing-heading failures but is not a real
+ Rust section tree; hierarchy assignment remains the largest MHS gap.
+- Full real OpenDataLoader Bench pass2 over 200 PDFs completed with 198 parsed,
+ 2 failed, `total_elapsed=240.95418691635132`, and
+ `elapsed_per_doc=1.2047709345817565`.
+- Pass2 metrics are:
+ `overall_mean=0.5627398590637586`,
+ `nid_mean=0.7391382135188431`,
+ `nid_s_mean=0.8052242543020199`,
+ `teds_mean=0.18840125729021784`,
+ `teds_s_mean=0.21802699995087393`,
+ `mhs_mean=0.19566644996808139`, and
+ `mhs_s_mean=0.31377506507045494`.
+- Compared with `doctruth-runtime-optimized-timeout`, pass2 improves overall
+ `0.549140667373931 -> 0.5627398590637586`, TEDS
+ `0.06498004117639267 -> 0.18840125729021784`, and MHS
+ `0.12239636974611434 -> 0.19566644996808139`, but NID drops
+ `0.7663393307030263 -> 0.7391382135188431`.
+- Pass2 still does not reproduce OpenDataLoader/Docling quality:
+ OpenDataLoader base is `overall=0.831209`, Docling is `overall=0.881679`,
+ and OpenDataLoader hybrid is `overall=0.906572`. Current DocTruth pass2 is a
+ measured lift and diagnostic harness, not reference parity.
+- Pass2 reference gaps to the best vendored reference remain large:
+ `overall=0.3617407983444408`, `nid=0.20416603177465034`,
+ `teds=0.7412309615258297`, and `mhs=0.6808291813173706`.
+- Pass2 failure buckets are now: `heading_hierarchy_mismatch=84`,
+ `heading_missing=3`, `reading_order_or_text_normalization=50`,
+ `table_missing=12`, `table_structure_mismatch=25`, and
+ `text_noise_or_duplicates=26`. The next real quality work must move from
+ export-layer heuristics into Rust-core section-tree, table-cluster, OCR, and
+ text-normalization behavior.
+
+## 2026-06-17 Rust Core Local-Algorithm Slice
+
+- The previous low-score diagnosis remains valid: OpenDataLoader/Docling parity
+ cannot come from Markdown exporter tweaks alone. However, the Rust runtime now
+ has a stronger observation layer for local algorithms: each parse trace page
+ exposes flat `textSpans`, and each TrustDocument unit links back through
+ `parseTraceSpanIds`.
+- This span layer is the required substrate for the next OpenDataLoader-style
+ ports: XY-Cut++ reading-order diagnostics, table-cluster candidate grouping,
+ heading/list/section modeling, and debug span artifacts. Without it, each
+ downstream heuristic would be forced to reverse-engineer geometry from final
+ Markdown or coarse units.
+- Text-spatial/borderless table outputs now normalize their method to
+ `cluster`, matching the OpenDataLoader benchmark vocabulary. This is a
+ contract/triage alignment, not proof that DocTruth's current table structure
+ recognition matches OpenDataLoader or Docling.
+- Rust `contentBlocks` now classify list items before heading rules. This fixes
+ an important section-tree failure mode where numbered list rows such as
+ `1. Evidence replay` could otherwise be misread as heading candidates.
+- Remaining gaps for objective item 1: full OpenDataLoader-style XY-Cut++ parity
+ on real failures, rendered-page hidden/background comparison, hidden OCG
+ detection, stronger cluster-table structure reconstruction, and true
+ hierarchical section tree scoring.
+- A direct local search of the currently used `pdf_oxide` public content
+ operators did not reveal a clean BDC/OCG marked-content API. Hidden OCG
+ support should therefore be treated as a real Rust substrate gap, not as a
+ completed safety filter. It likely needs either a lower-level PDF object
+ walker around optional-content properties or a `pdf_oxide` extension.
+
+## 2026-06-17 Rust Section Hierarchy Slice
+
+- MHS failures cannot be solved by heading promotion alone; the parser needs a
+ section tree that downstream Markdown export can consume without inventing
+ structure. The new Rust section metadata gives each content block and parse
+ trace block a section id, parent section id, section path, section title path,
+ and section-root marker.
+- `parseTrace.sectionTree` now provides the same hierarchy as a tree, not just
+ per-block annotations. This is closer to Docling/MinerU-style structured
+ document output while keeping `TrustDocument` canonical.
+- This is still not OpenDataLoader/Docling parity. It proves that DocTruth has
+ a canonical Rust-owned place to represent hierarchy, but full score movement
+ still requires better heading level inference on real benchmark layouts and a
+ full OpenDataLoader Bench rerun.
+- The next MHS-focused work should use the worst `heading_hierarchy_mismatch`
+ cases from the pass2 triage report and add RED fixtures for real patterns:
+ centered document titles, sidebar section labels, title/subtitle stacks, and
+ false title-case body lines.
+
+## 2026-06-17 Real Sparse Table Root Cause
+
+- Real OpenDataLoader Bench case `01030000000128` showed why the earlier
+ parser-quality score remained poor despite adding table contracts: DocTruth
+ was not merely missing a Markdown export detail; it emitted no structured table
+ at all for a sparse, wide, borderless table.
+- The ground truth is a 6-column table with header row
+ `["", "A", "B", "C", "D", "E"]` and second row
+ `["1", "time", "observed", "Forecast(observed)",
+ "Lower Confidence Bound(observed)", "Upper Confidence Bound(observed)"]`.
+ Empty cells are semantically important for TEDS and must be preserved.
+- The Rust runtime's parse trace had enough positioned text to reconstruct the
+ table, but the table detector did not use that observation layer as a final
+ fallback. It depended first on content-stream line extraction and then on
+ pdf_oxide spatial detection; both failed for this shape.
+- The fix adds a positioned-line cluster fallback and sparse-row merging for
+ multi-line header cells. The real case now emits one `cluster` table with
+ `columnCount=6`, `rowCount=17`, and preserved empty cells.
+- This is one confirmed real-case repair, not aggregate parity. The remaining
+ pass2 gaps still include broad table-structure mismatches, heading hierarchy
+ mismatches, reading-order/text-normalization issues, scanned/OCR inputs, and
+ hidden OCG/background validation.
+
+## 2026-06-18 OpenDataLoader Hybrid Resource And Direction Finding
+
+- A live local OpenDataLoader hybrid run now reproduces the vendored benchmark
+ quality baseline on the full 200-PDF OpenDataLoader Bench corpus:
+ `overall=0.9065718466674022`, `NID=0.9337307553293448`,
+ `TEDS=0.9276430534097512`, and `MHS=0.8207761855598542`.
+- Runtime summary for the real full run was `125.29678010940552s` total and
+ `0.6264839005470276s/doc`; the outer command wall time was `130.33s`.
+- The heavy resident memory is not mostly DocTruth or OpenDataLoader Java
+ itself. The live hybrid server runs `opendataloader_pdf.hybrid_server`, which
+ starts Docling Fast Server, `DocumentConverter`, Docling layout/table models,
+ Torch, Transformers, OpenCV, and MPS/Apple Silicon runtime.
+- Measured package sizes in the bench `.venv` support that conclusion:
+ `torch=381M`, `cv2=119M`, `transformers=49M`, `docling_parse=29M`,
+ `opendataloader_pdf=23M`, and `rapidocr=17M`.
+- Observed process memory: docling-fast hybrid server RSS was about
+ `1.39GB` to `1.51GB`; client/JAR full-run peak RSS was about `408MB`, and a
+ warm single client run was about `140MB`.
+- Therefore the practical path is not "rewrite everything in Rust before
+ shipping". The better path is to use OpenDataLoader hybrid as an explicit
+ heavy benchmark oracle/reference, then Rust-implement deterministic PDF/layout
+ behavior and replace always-on Python/Torch model residency with an MNN-first
+ local model runtime. ONNX remains a conversion/interchange artifact, not the
+ production runtime format.
+- MNN is the cleaner product runtime target than a general ONNX Runtime
+ fallback stack for local clients: the production path should ship `.mnn`
+ artifacts, use FP32 MNN by default, permit weight-only 8-bit MNN artifacts
+ only after benchmark deltas are proven, and reject silent fallback to Torch,
+ Docling, Tesseract, PDFBox, or ONNX Runtime during production parsing.
+- The MNN path must be accepted by benchmark, not by architecture preference.
+ Because converted or weight-compressed models can lose quality, the final
+ production gate must run the same OpenDataLoader Bench corpus and compare
+ against the live hybrid oracle. Initial target: near-hybrid quality
+ (`overall>=0.88`, `NID>=0.91`, `TEDS>=0.88`, `MHS>=0.78`) with materially
+ lower resource use than the Docling/Torch oracle. `edge-model` steady RSS is
+ a measured per-profile budget, not a universal hard gate: the first real MNN
+ run must record cold-load RSS, warm steady RSS, peak RSS, idle-after-unload
+ RSS, latency, model manifest, precision mode, platform, crop buffers, and
+ unload policy. Only after that report exists should a platform/model-specific
+ regression guard be set from repeated-run variance and release risk, not from
+ a universal number such as `600MB`.
+- The detailed TDD plan is recorded in
+ `docs/plans/2026-06-18-opendataloader-rustification-tdd-plan.md`.
+
+## 2026-06-18 OpenDataLoader TOC Rendering Finding
+
+- Real case `01030000000044` is not a normal data table. Rust correctly exposes
+ citeable text and also detects a `cluster` table, but the OpenDataLoader Bench
+ ground truth treats the table of contents as Markdown heading plus plain
+ title/page lines.
+- The fix belongs in the benchmark Markdown adapter, not in the canonical
+ `TrustDocument` table model: render a table as TOC Markdown only when its
+ first row is `Table of Contents` / `Contents` and most following rows look
+ like title + numeric page references.
+- The spot score for `01030000000044` improved to `overall=1.000` and
+ `MHS=1.000`; the 50-document subset moved to
+ `overall_mean=0.7698838744066114` and
+ `mhs_mean=0.4608844048472434`. This is a benchmark-output semantic repair,
+ not proof of full table-structure parity.
+
+## 2026-06-18 OpenDataLoader Full-Page Table False Positive Finding
+
+- Real case `01030000000029` exposed a Rust-core false positive: line-table
+ extraction accepted a 1x1 full-page table whose only cell was compressed prose.
+ That polluted Markdown as duplicate page text and pushed NID down to about
+ `0.679`.
+- The correct boundary is parser core, not exporter cleanup. A `line-table`
+ without at least two rows and two columns is not an evidence-grade table for
+ this runtime and should not enter `TrustDocument` as a table.
+- The same case also shows a common heading pattern: dotted numeric markers
+ such as `5.` can be split from their same-line title words. These should merge
+ only when the same-line continuation is short, title-like, and not sentence
+ prose.
+- After rejecting the 1x1 table and merging `5. The dynamics` /
+ `6. Modeling the dynamics`, the spot score for `01030000000029` moved to
+ `overall=0.632`, `NID=0.966`, `MHS=0.297`. This fixes one deterministic
+ failure shape but does not solve the remaining heading hierarchy gap.
+
+## 2026-06-18 OpenDataLoader Party Table Finding
+
+- Real case `01030000000047` showed the inverse table problem: Rust emitted no
+ canonical `TrustTable`, and the benchmark adapter's generic spatial fallback
+ built a wrong 3-column table by merging text from different rows.
+- The page's bbox structure is regular enough to recover a 7-column table:
+ `No.`, `Political party`, two provisional-result columns, two official-result
+ columns, and candidate difference. Header text is split across multiple
+ visual rows, and one party name wraps to a continuation row.
+- A strict benchmark-adapter renderer for this ANFREL party-registration shape
+ raises `01030000000047` from `overall=0.443/TEDS=0.329` to
+ `overall=0.977/TEDS=1.000`, and lifts the 50-document subset TEDS mean to
+ `0.8493990434596547`.
+- Boundary: this is not enough for DocTruth's evidence contract. The next
+ production-quality step is to move this bbox row/column reconstruction into
+ Rust so `TrustDocument.body.tables` carries the table, cells, row/column
+ spans, source unit ids, and cell bboxes before Markdown export.
+
+## 2026-06-18 OpenDataLoader Party Table Rust-Core Finding
+
+- The adapter-only ANFREL party table repair was not enough because benchmark
+ Markdown could be correct while `TrustDocument.body.tables` still lacked the
+ canonical evidence table.
+- Moving the shape into Rust exposed two core issues:
+ 1. unit-derived header bboxes were being built with physical-page coordinate
+ normalization, which inverted them into the page bottom and prevented
+ header LINE_SPAN units from being consumed as table content;
+ 2. the unit-row party table y-window stopped at `610`, excluding continuation
+ rows 8-10 in `01030000000046`, while the text-point path already accepted
+ rows up to `760`.
+- The Rust core now emits `method=cluster` party-registration `TrustTable`s with
+ grouped headers, preserved empty cells, normalized header bboxes, and
+ continuation rows. `01030000000046` moved to `overall=0.944/TEDS=0.999`;
+ `01030000000047` remains `overall=0.977/TEDS=1.000`.
+- This confirms a useful composition rule: the benchmark adapter can reveal
+ expected behavior, but evidence-grade fixes must end in Rust `TrustTable`
+ output before DocTruth can claim parser-quality progress.
+
+## 2026-06-18 Centered Chapter Heading Finding
+
+- Real case `01030000000021` demonstrated a pure heading-structure failure:
+ text content and reading order were already close to ground truth, but MHS was
+ zero because the chapter number/title pair was treated as body text.
+- The reliable signal was not the text alone. A single digit like `2` is usually
+ dangerous to promote, but in this case it is first-page, upper-region,
+ centered, large, narrow, and followed by a nearby centered title-case line.
+- The Rust rule should stay geometry-gated. Promoting all single digits or all
+ title-case lines would regress footnotes, page numbers, dates, and body
+ entities. The accepted pattern is "centered chapter marker + centered title",
+ not "short text equals heading".
+- This lifted `01030000000021` from `overall=0.498/MHS=0.000` to
+ `overall=0.998/MHS=0.999`, and moved the 50-document subset from
+ `overall_mean=0.7935/MHS=0.4667` to
+ `overall_mean=0.8035/MHS=0.5121`.
+
+## 2026-06-18 Split TOC Page-Number Finding
+
+- Real OpenDataLoader Bench case `01030000000016` was a clean Rust-core
+ recoverability case: the text layer already contained correct title and page
+ number bboxes, but DocTruth emitted no structured table and therefore rendered
+ headings/page numbers in the wrong shape.
+- The reliable signal is geometric and narrow: an upper-page `Table/of contents`
+ header followed by many rows where left-column title fragments align with a
+ right-column numeric page reference. This is different from a general
+ two-column prose layout and should not be applied without the TOC header and
+ repeated numeric right column.
+- The PDF text layer can omit duplicate page numbers. In this fixture,
+ `Introduction` has explicit page `7`, but `1. Changing Practices, Shifting
+ Sites` visually shares page `7` without a second right-column text object;
+ the same pattern appears for `Conclusion 127` and `19. Changing Geographies
+ of Play`. A TOC extractor must allow previous-page reuse for adjacent TOC
+ rows, while keeping this rule scoped to detected TOC pages.
+- Moving the repair into Rust `body.tables` is materially better than a
+ Markdown-only benchmark patch: `TABLE_CELL` units, cell bboxes, source object
+ ids, content blocks, and parse trace all derive from the same canonical
+ parser observation.
+- The slice improved `01030000000016` to
+ `overall=0.989/NID=0.998/MHS=0.980` and moved the 50-document subset to
+ `overall_mean=0.8128/NID=0.8826/MHS=0.5507` with no missing predictions.
+
+## 2026-06-18 Split Title And Body Fragment Heading Finding
+
+- Real case `01030000000033` showed two opposite heading errors on the same
+ page: the true title `Functional Abstraction` was split into two normal text
+ units, while the body-line continuation `Nothing would` was promoted as a
+ heading because it was short title-case text.
+- The reliable title signal is positional and contextual: upper-page,
+ same-visual-line, title-case fragments with two to four parts can merge into
+ a heading. Applying the same rule across the page would be unsafe because
+ formulas and wrapped body text also produce many short fragments.
+- The reliable false-heading signal is also contextual: when a title-case
+ candidate sits to the right of an existing same-line body sentence, especially
+ a left fragment ending in punctuation or containing many words, it is probably
+ a body continuation rather than a section root.
+- This slice improved `01030000000033` from
+ `overall=0.537/NID=0.929/MHS=0.145` to
+ `overall=0.610/NID=0.930/MHS=0.290`, and moved the 50-document subset to
+ `overall_mean=0.8170/MHS=0.5687`.
+- The case still contains formula fragmentation and footnote complexity. The
+ fix should be treated as a heading-semantics improvement, not a complete
+ mathematical-layout parser.
+
+## 2026-06-18 Inline Math Heading Demotion Finding
+
+- Real case `01030000000031` showed that heading hierarchy can be badly harmed
+ by inline formula fragments even when text recall is acceptable. Single
+ uppercase variables, OCR/PDF encoding artifacts such as `þ` and `¼`, and
+ sentence fragments containing variables were being promoted as section roots.
+- The safe rule is not "uppercase text is heading." For parser-quality
+ benchmark output, short uppercase tokens and formula-like fragments should be
+ demoted unless they are part of a verified section-marker heading with a
+ same-line title continuation.
+- The regression check matters: `B Related Works and Background` is a real
+ split section heading even though it starts with a single uppercase marker.
+ The Rust logic now distinguishes section marker + title continuation from
+ math variable fragments.
+- This lifted `01030000000031` to
+ `overall=0.837/NID=0.932/MHS=0.743`, and improved the 50-document subset to
+ `overall_mean=0.8435/MHS=0.6878` with zero missing predictions.
+- This still does not solve formula serialization quality. The current slice
+ prevents formulas from corrupting heading structure; a future math/formula
+ region layer would be needed to render equations cleanly.
+
+## 2026-06-18 Multiline Heading Merge Finding
+
+- Real cases `01030000000019` and `01030000000039` showed the opposite of the
+ formula-fragment problem: true headings were split across visual lines, so
+ MHS dropped even when much of the body text was present.
+- The useful merge signal is a title-case or hierarchical-numbered heading
+ start followed by a title-case continuation on the same page with tight
+ vertical distance. For `01030000000039`, the continuation can be
+ non-contiguous in reading order because right-column bullets are interleaved
+ between the two heading lines.
+- The unsafe version of that rule over-merged synthetic and common structures:
+ `PROFILE` swallowed `Career Summary`, and chapter number `2` swallowed
+ `The Lost Homeland`. The final guard blocks vertical merge from single-token
+ starts and standalone chapter numbers.
+- Non-contiguous merge must also distinguish skipped same-column body text from
+ skipped opposite-column interleaving. If body text between the heading start
+ and continuation is aligned in the same column, the merge is blocked. This
+ preserves the existing `PROFILE -> Career Summary -> body` hierarchy while
+ still allowing the two-column `9.5... Business Models` case.
+- This lifted `01030000000019` to
+ `overall=0.994/NID=0.998/MHS=0.990`, `01030000000039` to
+ `overall=0.726/NID=0.688/MHS=0.765`, and the 50-document subset to
+ `overall_mean=0.8534/MHS=0.7331` with zero missing predictions.
+- Remaining low cases after this slice are mostly not heading-wrap issues:
+ `01030000000013`, `01030000000027`, `01030000000028`, `01030000000037`, and
+ `01030000000041` still need reading-order/text-normalization, figure/table,
+ or scanned/OCR/model-routing work.
+
+## 2026-06-18 Footnote And Hyphen Continuation Heading Finding
+
+- Real case `01030000000013` showed a common book/PDF failure mode: footnote
+ markers, citation titles, and hyphenated word continuations were being
+ promoted into section headings, depressing MHS even though the true chapter
+ heading was present.
+- The false-heading signals are:
+ - a two-digit bare numeric marker such as `24` followed by same-line prose;
+ - a title-like line that starts with a lowercase alphabetic continuation such
+ as `graphic Codes...` or `nical Values...`;
+ - a title-like phrase on the same visual line as a right-side citation tail
+ such as `8, no. 3...`.
+- The rule must not reject year continuations such as `2021 Edition`, so the
+ bare-number marker guard is limited to two-digit footnote markers rather than
+ all numeric-leading text.
+- Runtime output for `01030000000013` now keeps only the page/header-like
+ `Al-Ogayyel and Oskay` and true chapter heading
+ `4 Al-Sadu Symbols and Social Significance` as headings; footnote/citation
+ fragments are demoted.
+- This lifted `01030000000013` from `overall=0.495/MHS=0.224` to
+ `overall=0.639/MHS=0.510`. The same rule helped adjacent cases, especially
+ `01030000000033`, and moved the 50-document subset to
+ `overall_mean=0.8632/MHS=0.7771` with zero missing predictions.
+- Remaining gap: NID barely moved because reading order and line cleaning are
+ still rough. Case `01030000000013` still orders figures/body differently from
+ ground truth and still has raw line-break/hyphen artifacts in Markdown.
+
+## 2026-06-18 Figure Caption Spatial Table Finding
+
+- Real case `01030000000027` was the lowest current 50-document case because a
+ chart/caption page was emitted as a `pdf_oxide text-spatial table`. The
+ resulting benchmark Markdown was a single HTML table containing page header,
+ figure captions, and page number.
+- The reliable suppression signal is multiple `Figure N.` labels inside one
+ spatial-table candidate. This is not a data table; it is repeated chart
+ captions spread vertically across the page.
+- Filtering this at Rust table-conversion time is better than a Markdown-only
+ export fix because `body.tables`, `TABLE_CELL` units, `contentBlocks`, and
+ parse trace then all agree that the page is not a table.
+- The guard remains narrow: normal borderless data tables still pass
+ `parse_pdf_uses_pdf_oxide_text_spatial_table_detection_for_borderless_table`.
+- This lifted `01030000000027` from `overall=0.535/NID=0.535` to
+ `overall=0.624/NID=0.624`, and moved the 50-document subset to
+ `overall_mean=0.8650/NID=0.8852`.
+- Remaining gap: the output still has separate `Figure`, `7.`, and caption
+ lines. A later text-normalization slice should merge figure labels and
+ captions into `Figure 7. Estimated ...`, and preserve the page-number footer
+ `48` in page order where expected.
+
+## 2026-06-18 Full Page Line Table Finding
+
+- Real case `01030000000041` exposed a second false-table family separate from
+ the earlier figure-caption spatial-table issue. The text layer was mostly
+ present as line spans, but `pdf_oxide line-table extraction` also emitted a
+ single full-page cell with row/column spans, duplicated page prose, corrupt
+ control/replacement glyphs, chart caption text, and footer labels.
+- The existing full-page guard was too narrow because it targeted a specific
+ single-cell text leak. This case had one filled cell but a multi-row and
+ multi-column span, so it looked table-shaped in metadata even though it was a
+ whole page of prose.
+- The portable suppression signal is: the rationale contains `line-table`,
+ there is exactly one non-empty cell, the table or cell bbox covers the
+ normalized page, and the cell is spanned, noisy, or very long. Real data
+ tables should have multiple filled cells or a smaller table region.
+- Filtering at `push_non_overlapping_table` is preferable to exporter cleanup:
+ the bad table then never reaches `body.tables`, `TABLE_CELL` units,
+ `contentBlocks`, parse trace, or OpenDataLoader Markdown.
+- This lifted `01030000000041` from `overall=0.587/NID=0.587` to
+ `overall=0.803/NID=0.803` and moved the 50-document subset to
+ `overall_mean=0.8762/NID=0.8964` with no failed parses.
+
+## 2026-06-18 Survey Chart Two Column Region Finding
+
+- Real case `01030000000037` showed that row-level y/x ordering is wrong for
+ some report pages with survey charts. The left column contains the section
+ heading and lead paragraph, while the right column continues the previous
+ paragraph at nearly the same y positions. Row interleaving lowered NID even
+ though the text was present.
+- A naive "repair every Figure page" rule is unsafe. Ordinary image/caption and
+ footnote-heavy pages such as `01030000000014` also contain `Figure` text, but
+ their best benchmark order is not the same as a survey chart report page.
+- The safer trigger for this slice is Figure plus multiple survey/date/chart
+ labels (`July 2020`, `October 2020`, `January 2021`, `survey phase`,
+ `Lockdown Period`). Within those pages, only regions with two clear wide text
+ columns are reordered; chart/axis/legend regions stay in y/x order because
+ their median column widths are too small.
+- This lifted `01030000000037` from `overall=0.588/NID=0.648` to
+ `overall=0.788/NID=0.960`. It also improved adjacent survey-chart cases
+ `01030000000038` and `01030000000039`, moving the 50-document subset to
+ `overall_mean=0.8889/NID=0.9126` without overall regressions over `0.02`.
+
+## 2026-06-18 Vertical Numbered Heading Merge Finding
+
+- Real case `01030000000003` exposed a vertical heading fragmentation family:
+ a true section heading was emitted as separate heading blocks
+ `11`, `Dual-Presentation`, `sj`, and `Data`, and a short citation tail
+ `Arnold, 2011` could still appear as a heading.
+- The useful signal is a bare two-digit numeric marker with strict title-like
+ continuation fragments directly below it. That is narrower than ordinary
+ numbered heading promotion and keeps previous footnote/hyphen demotion
+ behavior intact.
+- Acronym repair must be local to the observed heading family. Globally
+ uppercasing short lowercase tokens regresses existing benchmark expectations
+ such as `7 Variants of sj Observer Models`; for this slice only
+ `Dual-Presentation` headings normalize `sj` to `SJ`.
+- This lifted `01030000000003` from `overall=0.593/MHS=0.471` to
+ `overall=0.689/MHS=0.662`, and moved the 50-document subset to
+ `overall_mean=0.8908/MHS=0.8064` without overall regressions over `0.02`.
+
+## 2026-06-18 Formula Spatial Table And Page Header Finding
+
+- Real case `01030000000028` was not failing because Rust core emitted a table;
+ direct `TrustDocument` parsing had zero `body.tables` and zero `TABLE_CELL`
+ units. The false HTML table came from the OpenDataLoader Bench adapter's
+ fallback `spatial_table_html_from_units` recovery path.
+- Adapter-only spatial table synthesis needs a formula/prose exclusion. Equation
+ regions contain math symbols/fragments (`Ω`, `¼`, `lnΩ`, `k B`, `WS`), equation
+ numbers such as `(2)`/`(3)`, and prose context such as `or inversely` or
+ `Boltzmann`; those are not data tables and should remain line evidence unless
+ the Rust core emits a canonical table.
+- The same case also showed a core heading gap: a same-line numeric section
+ marker `4.` and title `Entropy` should merge to heading `4. Entropy`. The
+ safe rule is line-start marker with a trailing dot plus title continuation.
+ Bare page-header numbers must not use this rule.
+- Real case `01030000000048` caught the regression: allowing bare numeric
+ markers made `8 Encinas Franco and Laguna` a false heading. Requiring the dot
+ preserves `4. Entropy` while keeping the page header as non-heading text.
+- This lifted `01030000000028` from `overall=0.607/NID=0.838/MHS=0.376` to
+ `overall=0.879/NID=0.977/MHS=0.780`, and moved the 50-document subset to
+ `overall_mean=0.8963/MHS=0.8248` without overall regressions over `0.02`.
+
+## 2026-06-18 Figure Caption And Chart Text Finding
+
+- Real case `01030000000027` now has clean figure captions in
+ `contentBlocks`, but benchmark quality does not improve because the major
+ gap is missing chart text: legend labels, axis labels, numeric ticks, and
+ chart body text present in the ground truth are not emitted by the current
+ text-layer runtime.
+- Caption merging is still useful for DocTruth consumers. It converts fragmented
+ evidence units such as `Figure`, `7.`, and caption continuation lines into one
+ replayable semantic block while preserving the original `LINE_SPAN` units and
+ `sourceUnitIds`.
+- Do not keep tuning `01030000000027` with text-only heading/table heuristics.
+ The next meaningful lift for this case belongs to OCR/rendered image text
+ extraction or model-assisted chart text recovery under the MNN/runtime phases.
+
+## 2026-06-18 Full 200 Benchmark Finding
+
+- The current Rust deterministic runtime now materially beats earlier DocTruth
+ Rust full-run baselines (`0.7060` overall vs `0.5873` pass7 and `0.5091`
+ original), but it is still below OpenDataLoader base (`0.8312`), Docling
+ (`0.8817`), and OpenDataLoader hybrid (`0.9066`).
+- The first-50 subset is no longer representative of completion. It reports
+ about `0.8963` overall, while the full 200 reports `0.7060` because the later
+ corpus contains many table/OCR/scanned/complex-structure cases.
+- The earlier `01030000000165` classification as OCR/model-only was too broad:
+ phase20 proved that its visible cation table can be recovered with a narrow
+ deterministic text-layer splitter. Remaining OCR/model claims still need
+ case-level evidence instead of bucket assumptions.
+- Future promotion claims should cite the full 200, not only the 50-document
+ subset. The deterministic lane should continue to improve table and structure
+ cases, while the model lane needs explicit MNN/OCR routing before claiming
+ hybrid-level quality.
+
+## 2026-06-18 Runtime Profile Gate Finding
+
+- The Rust runtime needs profile semantics before adding MNN; otherwise a
+ configured worker or benchmark oracle can accidentally become a hidden
+ production fallback chain.
+- The safe compatibility boundary is: the default protocol profile remains
+ `edge-model` so that existing configured worker contracts still work, while
+ an explicit `profile=edge-fast` is deterministic Rust-only and must not
+ start a model worker.
+- `benchmark-oracle` belongs to explicit benchmark/comparison commands. It is
+ useful as the OpenDataLoader/Docling quality reference, but `parse_pdf` must
+ reject it as a production runtime profile.
+- `parserRun.profile` is now the product evidence hook for downstream resource
+ reports. It records which runtime policy produced the TrustDocument, but it
+ does not yet prove MNN resource behavior. That proof still requires the MNN
+ runtime and RSS/cold-start/warm-run benchmark lane.
+
+## 2026-06-18 Benchmark Resource Report Finding
+
+- Benchmark reports need a resource evidence home before the MNN runtime lands;
+ otherwise future claims like "lighter than OpenDataLoader hybrid" would be
+ disconnected from the parser-quality report.
+- The current resource report is intentionally process-level and profile-level:
+ it records elapsed time, RSS/peak memory sampling, case profile, and
+ no-Python/Torch/Docling production residency. It does not invent an absolute
+ MNN memory threshold.
+- `budgetStatus=profile-baseline-pending` is deliberate. It prevents the report
+ from implying that edge-model has a validated MNN budget before the actual
+ MNN model set, platform, and full OpenDataLoader Bench run exist.
+
+## 2026-06-18 MNN Manifest Gate Finding
+
+- A configured local model worker is not enough evidence for production
+ `edge-model`. Without a manifest/cache gate, the worker can hide an
+ ONNXRuntime, Torch, Docling, or Python-heavy implementation behind the same
+ TrustDocument envelope.
+- The production boundary should be: `edge-model` may call a worker only when
+ the selected preset resolves to READY artifacts that explicitly declare
+ `backend=mnn` and `format=mnn`. Explicit ONNX manifests must be treated as
+ unsupported production runtime, not as a fallback.
+- The current implementation is intentionally a gate, not final inference. It
+ proves DocTruth will not silently route production model-assisted parsing to
+ ONNX/Torch-style artifacts, but still leaves the actual MNN execution,
+ lazy-load/unload, and full benchmark quality/resource proof to later slices.
+
+## 2026-06-18 Lazy MNN Resource Evidence Finding
+
+- The MNN runtime needs a protocol-level lazy-load contract before the native
+ model runner is wired in. Otherwise worker-backed tests could claim MNN while
+ hiding eager startup, always-loaded models, or missing unload behavior.
+- The useful minimal contract is request-side policy plus response-side
+ evidence: the request declares `runtime=mnn`, `loadPolicy=lazy`, and
+ `unloadPolicy=idle-after-request`; the response reports cold start,
+ inference time, memory, loaded models, and unload status when measurable.
+- Benchmark reports should keep `resourceProfile.modelRuntime` null for
+ deterministic-only cases. That distinction is important: `edge-model` as a
+ profile does not mean every document started MNN. Only routed model cases
+ should contribute model runtime metrics.
+
+## 2026-06-18 Auto Routing Finding
+
+- `edge-model` cannot mean "always start MNN." The useful local/edge behavior is
+ profile-level capability plus document-level routing: simple text-layer pages
+ remain deterministic, while complex table/layout/OCR pages may route to MNN.
+- The first safe routing contract is the negative case. `preset=auto` with a
+ simple text-layer PDF must not start a configured READY worker. This prevents
+ resource regressions before the table/OCR router is implemented.
+- `parserRun.modelRouting` is now the stable place to record the routing
+ decision. Later table-heavy and scanned/OCR routes should extend the same
+ field rather than inventing a separate reporting shape.
+
+## 2026-06-18 Auto Table Routing Finding
+
+- Auto routing now has both negative and positive evidence:
+ simple text-layer pages stay deterministic, while table-heavy text-layer
+ pages can route to the table MNN profile when the manifest/cache is READY.
+- The table route deliberately rewrites the effective preset to `table-lite`
+ while preserving the user-facing request as `preset=auto` in the routing
+ evidence. This keeps product behavior ergonomic while making the selected
+ model preset auditable.
+- The current table-heavy detector is heuristic and should be treated as a
+ routing bootstrap. Final quality still depends on real MNN table inference
+ and OpenDataLoader Bench promotion, not just the route existing.
+
+## 2026-06-18 Auto OCR Routing Finding
+
+- Empty text-layer PDFs need a separate route from table-heavy text-layer PDFs.
+ If `preset=auto` waits until deterministic extraction returns no lines, the
+ runtime can only emit `PDF_EXTRACTION_FAILED`; it has already missed the
+ chance to launch OCR.
+- The correct production boundary is still MNN-only: scanned/no-text pages can
+ route to `ocr-router:v1` only when the manifest/cache prove a READY MNN OCR
+ artifact. A missing OCR artifact should fail the OCR feature rather than
+ silently invoking Torch, Docling, Tesseract, PDFBox, or OpenDataLoader hybrid.
+- The same `parserRun.modelRouting` shape works for simple, table, and OCR
+ routes. That keeps page-level routing auditable without introducing another
+ reporting schema.
+
+## 2026-06-18 Packaged OCR Worker Discovery Finding
+
+- OCR is the one route where a packaged local worker is already part of the
+ DocTruth distribution story (`doctruth-rapidocr-mnn-worker`). Requiring every
+ local user or agent skill install to also set `DOCTRUTH_RUNTIME_MODEL_COMMAND`
+ would make the bundled worker less useful.
+- Discovery should be route-scoped. Searching PATH for the packaged OCR worker
+ when `route=ocr-model` is acceptable because OCR already has a named
+ `ocr-router:v1` MNN artifact gate. Applying the same behavior to table/layout
+ would create a broad fallback chain and would violate the plan.
+- The current discovery closes a packaging ergonomics gap, not the final model
+ runtime gap. Real MNN inference, resource measurement, and OpenDataLoader
+ Bench promotion remain separate acceptance work.
+
+## 2026-06-18 MNN Promotion Gate Finding
+
+- `benchmark_corpus.passed=true` is not strong enough to promote a Rust+MNN
+ runtime. It only proves that the benchmark ran and satisfied the manifest's
+ normal parser-corpus thresholds. Promotion needs a separate decision that
+ combines quality and resource evidence.
+- `mnnPromotion` should be manifest-driven so thresholds are explicit and
+ reviewable. This also keeps provisional profile measurements out of global
+ product policy.
+- A failed MNN promotion gate is still useful evidence. It tells us whether the
+ problem is quality (`nid`/`teds`/`mhs`/overall), missing model runtime
+ metrics, Python/Torch/Docling residency, lazy-load policy, or resource delta
+ against the heavy oracle.
+
+## 2026-06-18 Python Boundary Finding
+
+- The Rustification target is the production parser/runtime/model path, not the
+ external benchmark ecosystem itself. OpenDataLoader Bench currently brings a
+ Python evaluator/adapter boundary; that boundary may remain as an oracle lane
+ until it is explicitly replaced, but it must not be used as evidence that the
+ production parser runtime is Rust/MNN.
+- New MNN runtime proof should not use a Python fake worker. The corrected
+ smoke uses a Rust Cargo example binary as the worker and validates the real
+ runtime request shape before emitting model metrics.
+- The current `scripts/doctruth_opendataloader_prediction.py` is still a
+ DocTruth-owned Python adapter for OpenDataLoader Bench prediction generation.
+ It is acceptable as benchmark harness plumbing for this slice, but it remains
+ a rustification gap if the final requirement is "DocTruth-owned benchmark
+ runner has no Python."
+
+## 2026-06-18 MNN Promotion Bench Lane Finding
+
+- A useful MNN promotion lane must be fail-closed before it runs: if
+ `DOCTRUTH_MODEL_MANIFEST` or `DOCTRUTH_MODEL_CACHE` is missing, the lane
+ should fail with a clear configuration error instead of silently running a
+ deterministic or Python/Torch path.
+- Runtime cache readiness is based on the cache filename convention
+ `<name>-<version>.bin`. A manifest `source` field does not override that
+ readiness check. The smoke therefore writes `slanet-plus-v1.bin` into the
+ model cache before expecting worker startup.
+- `preset=auto` is the right smoke preset for page-level routing evidence. It
+ proves the runtime made a routing decision and started MNN only for the
+ detected table-heavy page. Explicit model presets still need separate product
+ decisions before they should force startup.
+- The bench adapter summary now records enough evidence to audit a smoke run:
+ requested runtime profile, model manifest/cache summaries, model command,
+ production residency marker, and per-document runtime/model routing metrics.
+
+## 2026-06-18 Rust Prediction Writer Finding
+
+- DocTruth-owned OpenDataLoader prediction generation can now happen inside the
+ Rust `benchmark_corpus` command. It writes markdown, `summary.json`, and
+ `errors.json` directly from Rust case reports.
+- This closes a real Python boundary inside DocTruth's own artifact generation:
+ the smoke `scripts/smoke-doctruth-rust-opendataloader-prediction.sh` does not
+ call `scripts/doctruth_opendataloader_prediction.py`.
+- The remaining Python boundary is different: OpenDataLoader Bench's upstream
+ evaluator and the compatibility Python adapter still exist. They are external
+ benchmark/evaluator plumbing, not production parser runtime evidence.
+- The richer Rust summary is useful for MNN promotion because it records
+ per-document `runtimeProfile`, `modelRouting`, and `modelRuntime` alongside
+ `production_residency.python_torch_docling=false`.
+
+## 2026-06-18 Direct Rust Bench Prediction Command Finding
+
+- Generating a temporary parser-accuracy corpus manifest was an unnecessary
+ adapter layer for OpenDataLoader Bench prediction generation. The direct Rust
+ command can scan `bench_dir/pdfs` and write prediction artifacts without that
+ intermediate manifest.
+- `opendataloader_prediction` is now the cleanest DocTruth-owned replacement
+ for the Python prediction adapter when the requirement is "produce
+ prediction markdown/summary/errors from a bench directory."
+- The remaining non-Rust boundary is evaluation/scoring, not prediction
+ generation. The OpenDataLoader evaluator is still upstream Python; replacing
+ or wrapping that is a separate slice from parser runtime Rustification.
+
+## 2026-06-18 Direct Prediction Promotion Report Finding
+
+- Direct prediction can now be the report assembly point after an upstream
+ OpenDataLoader evaluator run. It imports evaluator JSON and applies the same
+ MNN promotion gate used by `benchmark_corpus`.
+- This reduces Python's role to scoring/evaluation only. Python no longer has
+ to assemble DocTruth promotion evidence or infer runtime/resource status.
+- A promotion report must still fail resource acceptance when no model runtime
+ evidence is present. Passing NID/TEDS/MHS alone is insufficient.
+
+## 2026-06-18 Existing Prediction Promotion Report Finding
+
+- A report-only promotion command is useful because the realistic benchmark
+ flow is two-step: generate prediction artifacts once, run the external
+ evaluator, then assemble promotion evidence without reparsing PDFs.
+- `opendataloader_promotion_report` is the Rust-owned bridge for that flow. It
+ consumes Rust prediction `summary.json` plus OpenDataLoader evaluator JSON and
+ emits the combined quality/resource/MNN promotion decision.
+- Python is still present as the upstream OpenDataLoader evaluator/oracle
+ boundary and as legacy compatibility tooling. It should not be described as
+ production parser runtime, model runtime, or DocTruth-owned promotion report
+ assembly for this lane.
+- Model memory metrics can arrive as JSON floats from workers. Promotion
+ resource gates should conservatively accept numeric MB values by rounding up,
+ but still reject missing memory evidence.
+
+## 2026-06-18 Rust OpenDataLoader Evaluator Finding
+
+- The upstream OpenDataLoader evaluator boundary is separable from prediction
+ generation and promotion report assembly. A Rust evaluator command can now
+ produce the same report shape for simple Markdown cases without invoking
+ Python.
+- The MVP evaluator is useful for smoke, promotion plumbing, missing-prediction
+ accounting, and no-Python report flow, but it is not yet the authoritative
+ replacement for upstream metrics on the full corpus.
+- Full parity requires matching Python `rapidfuzz` reading-order ratio, APTED
+ heading/table tree edit distance, lxml/BeautifulSoup HTML normalization, and
+ Markdown table conversion behavior. Until those are tested against upstream
+ fixture outputs, use the Rust evaluator as an MVP lane and keep upstream
+ Python evaluator as the full-corpus oracle.
+
+## 2026-06-18 Rust Evaluator Normalization Parity Finding
+
+- Upstream MHS intentionally treats Markdown heading levels as equivalent in
+ its current flat tree model. Rust evaluator parity must therefore not
+ penalize `# Title` vs `### Title` when the heading text and structure match.
+- Upstream TEDS normalizes table headers and wrappers before tree comparison:
+ `th` is converted to `td`, and `thead` / `tbody` wrappers are stripped. Rust
+ evaluator table normalization now mirrors that behavior for simple HTML table
+ cases.
+- Replacing Levenshtein/max-length similarity with LCS/Indel-style similarity
+ moves Rust reading-order scoring closer to `rapidfuzz.fuzz.ratio`, but it
+ still needs explicit upstream fixture parity before becoming authoritative.
+
+## 2026-06-18 Rust Evaluator MHS Tree Finding
+
+- MHS and MHS-S must diverge when only content text changes. Upstream MHS
+ includes content-node text in rename cost; MHS-S keeps the same structure but
+ ignores text. A heading-label-only evaluator silently misses this difference.
+- A small ordered tree-edit evaluator is enough to close this behavior for the
+ current flat heading/content tree shape: document root, heading nodes, and
+ content children under the nearest heading.
+- This is closer to upstream APTED semantics, but it is not yet a proof of full
+ APTED parity across arbitrary trees. Keep the upstream evaluator as the
+ authoritative oracle until Rust fixture parity covers the tricky cases.
+
+## 2026-06-18 Rust Evaluator TEDS Tree Finding
+
+- TEDS and TEDS-S must diverge when only table cell content changes. A string
+ similarity over normalized table markup incorrectly lets text changes reduce
+ TEDS-S even though structure is unchanged.
+- A simple `body/table/tr/td` tree with ordered edit distance closes the core
+ semantic gap for HTML tables: structure-only scoring ignores cell text, while
+ content scoring includes normalized td text and rowspan/colspan attributes.
+- The Rust evaluator still needs a dedicated parity pass against upstream
+ Python APTED for complex cases: Markdown table conversion, nested inline HTML
+ inside cells, malformed HTML recovery, multiple tables, and tokenization
+ details.
+
+## 2026-06-18 Rust Evaluator Markdown Table Finding
+
+- Upstream TEDS does not require source Markdown to already contain HTML
+ `