From aef78ef9884693c9858e59fa407a51f99ebaf04c Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:11:07 +0200 Subject: [PATCH 1/6] Scaffold Antora site (#494 step 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the minimum Antora config needed to produce a one-page static site: - docs/antora-playbook.yml — site playbook, asciidoctor-kroki extension, default Antora UI bundle, output at docs/build/site - docs/antora.yml — component `skainet`, version ~ (HEAD) - docs/modules/ROOT/nav.adoc — minimal Divio navigation scaffold (sections empty; pages land in commit 2) - docs/modules/ROOT/pages/index.adoc — landing page with Diataxis section intros and an admonition pointing LLM readers at the SKaiNET-transformers sibling repo - docs/.docker/Dockerfile — verbatim copy from SKaiNET-transformers: node:20-alpine + Chromium + font-noto + @antora/cli@3.1 + @antora/site-generator@3.1 + asciidoctor-kroki@0.18 + @mermaid-js/mermaid-cli@11. Built locally as skainet-antora:local, never pushed to any registry. Verified locally: docker build -t skainet-antora:local docs/.docker docker run --rm -v "$PWD:/antora" -w /antora \ skainet-antora:local --stacktrace docs/antora-playbook.yml Exit 0, zero warnings, produces docs/build/site/skainet/index.html. The navigation sections are intentionally stubbed for this commit — content migration to the Divio layout (Tutorials / How-to / Reference / Explanation) is commit 2. Dokka bundling into a sibling /api/ path is commit 6. First step of the six-commit docs-to-Antora migration plan. See issue #494. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/.docker/Dockerfile | 37 ++++++++++++++++++++++++++++++ docs/antora-playbook.yml | 26 +++++++++++++++++++++ docs/antora.yml | 5 ++++ docs/modules/ROOT/nav.adoc | 13 +++++++++++ docs/modules/ROOT/pages/index.adoc | 34 +++++++++++++++++++++++++++ 5 files changed, 115 insertions(+) create mode 100644 docs/.docker/Dockerfile create mode 100644 docs/antora-playbook.yml create mode 100644 docs/antora.yml create mode 100644 docs/modules/ROOT/nav.adoc create mode 100644 docs/modules/ROOT/pages/index.adoc diff --git a/docs/.docker/Dockerfile b/docs/.docker/Dockerfile new file mode 100644 index 00000000..67c21ba6 --- /dev/null +++ b/docs/.docker/Dockerfile @@ -0,0 +1,37 @@ +FROM node:20-alpine + +LABEL org.opencontainers.image.title="SKaiNET Antora" \ + org.opencontainers.image.description="Antora site generator with built-in Mermaid rendering" \ + org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET-transformers" + +# Chromium for mermaid-cli (puppeteer) +RUN apk add --no-cache chromium font-noto + +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \ + PUPPETEER_SKIP_DOWNLOAD=true + +# Install Antora + extensions to /opt/antora (not /antora which gets volume-mounted) +WORKDIR /opt/antora +RUN npm init -y && npm i --save-exact \ + @antora/cli@3.1 \ + @antora/site-generator@3.1 \ + asciidoctor-kroki@0.18 \ + @mermaid-js/mermaid-cli@11 \ + && npm cache clean --force + +# Make installed modules visible when workdir is the mounted project +ENV NODE_PATH=/opt/antora/node_modules + +# Mermaid-cli config +RUN echo '{ \ + "executablePath": "/usr/bin/chromium-browser", \ + "args": ["--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"] \ +}' > /opt/antora/puppeteer-config.json + +# Verify mermaid works +RUN echo 'graph TD; A-->B;' > /tmp/test.mmd \ + && npx mmdc -i /tmp/test.mmd -o /tmp/test.svg -p /opt/antora/puppeteer-config.json \ + && rm /tmp/test.mmd /tmp/test.svg + +ENTRYPOINT 
["/opt/antora/node_modules/.bin/antora"] +CMD ["--stacktrace", "antora-playbook.yml"] diff --git a/docs/antora-playbook.yml b/docs/antora-playbook.yml new file mode 100644 index 00000000..4c7b9bca --- /dev/null +++ b/docs/antora-playbook.yml @@ -0,0 +1,26 @@ +site: + title: SKaiNET + start_page: skainet::index.adoc + +content: + sources: + - url: /antora + start_path: docs + branches: HEAD + +asciidoc: + extensions: + - asciidoctor-kroki + attributes: + # Use local mermaid-cli via Kroki (no external server needed when + # built with the custom Docker image in docs/.docker/Dockerfile — + # copied verbatim from SKaiNET-transformers). + kroki-fetch-diagram: true + +ui: + bundle: + url: https://gitlab.com/antora/antora-ui-default/-/jobs/artifacts/HEAD/raw/build/ui-bundle.zip?job=bundle-stable + snapshot: true + +output: + dir: ./build/site diff --git a/docs/antora.yml b/docs/antora.yml new file mode 100644 index 00000000..05bf9566 --- /dev/null +++ b/docs/antora.yml @@ -0,0 +1,5 @@ +name: skainet +title: SKaiNET +version: ~ +nav: + - modules/ROOT/nav.adoc diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc new file mode 100644 index 00000000..49fd1479 --- /dev/null +++ b/docs/modules/ROOT/nav.adoc @@ -0,0 +1,13 @@ +* xref:index.adoc[Overview] + +.Tutorials +* (pages migrated in a later commit) + +.How-to guides +* (pages migrated in a later commit) + +.Reference +* (pages migrated in a later commit) + +.Explanation +* (pages migrated in a later commit) diff --git a/docs/modules/ROOT/pages/index.adoc b/docs/modules/ROOT/pages/index.adoc new file mode 100644 index 00000000..fd1fda7e --- /dev/null +++ b/docs/modules/ROOT/pages/index.adoc @@ -0,0 +1,34 @@ += SKaiNET +:description: Kotlin Multiplatform tensor engine with a graph IR, pluggable backends, and StableHLO export. + +SKaiNET is a Kotlin Multiplatform tensor / compile / graph engine. 
+It provides a tensor DSL, execution contexts, a graph IR, model +loaders (GGUF, SafeTensors, ONNX), quantization primitives +(Q4_K, Q8_0, ternary, TurboQuant), a StableHLO emitter for cross- +platform compile targets, and a pluggable backend API that CPU, +GPU, and NPU backends can implement independently. + +This documentation site is organized following the +https://diataxis.fr/[Diátaxis / Divio framework]: + +Tutorials:: Learning-oriented. Start here if you are new to SKaiNET. +How-to guides:: Task-oriented. Recipes for solving specific problems. +Reference:: Information-oriented. Looking up APIs and op coverage. +Explanation:: Understanding-oriented. Background on design decisions. + +[NOTE] +==== +LLM-specific runtimes (Llama, Gemma, Qwen, BERT) live in the +sibling https://github.com/SKaiNET-developers/SKaiNET-transformers[SKaiNET-transformers] +repository and its own documentation site. This site covers the +engine layer only. +==== + +== Quick links + +* link:../api/index.html[API reference (Dokka)] (bundled at publish time) + +// The Tutorials / How-to / Reference / Explanation pages plus the +// operator coverage xref land in follow-up commits (#2 and #3 of +// the Antora migration). This page ships the landing copy first so +// the scaffold build succeeds with a real start_page. 
From 18f9c80dca1df8165c9cb54865869fb4fb4f5734 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:33:00 +0200 Subject: [PATCH 2/6] Migrate docs to Antora Divio layout (#494 step 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moves every existing doc file under `docs/` into the Antora `modules/ROOT/pages/` tree, organized by the four Diataxis categories: Tutorials/ — getting-started and DSL learning paths How-to/ — task-oriented recipes (build, cli, I/O, training, Arduino) Reference/ — architecture, api stub pointing at Dokka Explanation/ — design articles, theory, examples, perf, known issues Markdown sources were converted to AsciiDoc via pandoc 3.9 (run through the `pandoc/core` Docker image for determinism). Pre-existing `.adoc` files were git-mv'd to preserve history. Image assets (SKaiNET-compiler.svg, SKaiNET-logo.png) moved to `docs/modules/ROOT/images/` and `image::` references adjusted to Antora-relative paths. Two structural cleanups handled in the same commit: - `[source,mermaid]` fences bulk-rewritten to `[mermaid]` so asciidoctor-kroki recognizes them. 5 diagrams touched across hlo-getting-started.adoc, arduino-c-codegen.adoc, and operator-design.adoc. - Literal `xref::` and `include::` mentions in the prose of operator-design.adoc escaped to backticks so Antora stops parsing them as real xrefs. This was the one build error after the initial pandoc pass. Root `ARCHITECTURE.md` was a 4-line stub — its content moved to `reference/architecture.adoc` with a cleaner title and a pointer comment. The root file now contains a 3-line link to the published page under the new site URL. Nav populated with all four Divio sections. A new `reference/api.adoc` page stubs the link to the Dokka-generated API (which will be bundled at publish time by commit 6). Local build verified: docker run --rm -v "$PWD:/antora" -w /antora \ skainet-antora:local docs/antora-playbook.yml Zero errors. 
13 non-blocking warnings remain as cleanup debt: pandoc section-level artifacts in skainet-for-ai.adoc and arduino-c-codegen.adoc, missing attribute references (framework_name, ksp_version, dokka_version, asciidoctorj_version) in operator-design.adoc from the original author, and one kroki mermaid 400 from kroki.io on a large diagram. All tracked for a follow-up cleanup pass. Second step of the six-commit docs-to-Antora migration plan. See issue #494. Co-Authored-By: Claude Opus 4.6 (1M context) --- ARCHITECTURE.md | 6 +- docs/arduino-c-codegen.md | 75 ------- docs/build_help.md | 81 ------- docs/kllama-getting-started.md | 22 -- .../ROOT/images}/SKaiNET-compiler.svg | 0 .../ROOT/images}/SKaiNET-logo.png | Bin docs/modules/ROOT/nav.adoc | 25 ++- .../pages/explanation}/examples/index.adoc | 6 +- .../pages/explanation/examples/matmul.adoc} | 0 .../issues/native-macos-accelerate-simd.adoc} | 131 ++++++------ .../pages/explanation/operator-design.adoc} | 12 +- .../explanation/perf/java-25-cpu-backend.adoc | 111 ++++++++++ .../ROOT/pages/explanation/perf/jvm-cpu.adoc | 110 ++++++++++ .../pages/explanation/skainet-for-ai.adoc | 143 +++++++++++++ .../ROOT/pages/explanation}/theory/index.adoc | 8 +- .../pages/explanation}/theory/matmul.adoc | 0 .../ROOT/pages/how-to/arduino-c-codegen.adoc | 95 +++++++++ docs/modules/ROOT/pages/how-to/build.adoc | 87 ++++++++ .../ROOT/pages/how-to/io-readers.adoc} | 153 ++++++++------ .../ROOT/pages/how-to/java-cli-app.adoc} | 90 ++++---- .../pages/how-to/java-llm-inference.adoc} | 163 ++++++++------- .../pages/how-to/java-model-training.adoc} | 197 ++++++++++-------- docs/modules/ROOT/pages/reference/api.adoc | 19 ++ .../ROOT/pages/reference/architecture.adoc | 11 + .../ROOT/pages/tutorials/graph-dsl.adoc} | 49 +++-- .../pages/tutorials/hlo-getting-started.adoc} | 191 +++++++++-------- .../tutorials/java-getting-started.adoc} | 109 +++++----- .../tutorials/kllama-getting-started.adoc | 26 +++ docs/nav.adoc | 50 ----- 
docs/perf/java-25-cpu-backend.md | 99 --------- docs/perf/jvm-cpu.md | 94 --------- docs/skainet-4-ai.md | 127 ----------- 32 files changed, 1232 insertions(+), 1058 deletions(-) delete mode 100644 docs/arduino-c-codegen.md delete mode 100644 docs/build_help.md delete mode 100644 docs/kllama-getting-started.md rename docs/{ => modules/ROOT/images}/SKaiNET-compiler.svg (100%) rename docs/{ => modules/ROOT/images}/SKaiNET-logo.png (100%) rename docs/{ => modules/ROOT/pages/explanation}/examples/index.adoc (87%) rename docs/{examples/matmul-examples.adoc => modules/ROOT/pages/explanation/examples/matmul.adoc} (100%) rename docs/{issues/native-macos-accelerate-simd.md => modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc} (51%) rename docs/{ops-docs.adoc => modules/ROOT/pages/explanation/operator-design.adoc} (96%) create mode 100644 docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc create mode 100644 docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc create mode 100644 docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc rename docs/{ => modules/ROOT/pages/explanation}/theory/index.adoc (80%) rename docs/{ => modules/ROOT/pages/explanation}/theory/matmul.adoc (100%) create mode 100644 docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc create mode 100644 docs/modules/ROOT/pages/how-to/build.adoc rename docs/{io-readers-guide.md => modules/ROOT/pages/how-to/io-readers.adoc} (88%) rename docs/{java-cli-app.md => modules/ROOT/pages/how-to/java-cli-app.adoc} (85%) rename docs/{java-llm-inference.md => modules/ROOT/pages/how-to/java-llm-inference.adoc} (78%) rename docs/{java-model-training.md => modules/ROOT/pages/how-to/java-model-training.adoc} (80%) create mode 100644 docs/modules/ROOT/pages/reference/api.adoc create mode 100644 docs/modules/ROOT/pages/reference/architecture.adoc rename docs/{graph-dsl.md => modules/ROOT/pages/tutorials/graph-dsl.adoc} (77%) rename docs/{hlo-getting-started.md => 
modules/ROOT/pages/tutorials/hlo-getting-started.adoc} (59%) rename docs/{java-getting-started.md => modules/ROOT/pages/tutorials/java-getting-started.adoc} (78%) create mode 100644 docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc delete mode 100644 docs/nav.adoc delete mode 100644 docs/perf/java-25-cpu-backend.md delete mode 100644 docs/perf/jvm-cpu.md delete mode 100644 docs/skainet-4-ai.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 564b49fc..aecc5c0f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ -# 🏗️ Architecture -SKaiNET uses a hybrid backend strategy that separates development iteration from production deployment. +# Architecture -![Architecture diagram of SKaiNET compiler](docs/SKaiNET-compiler.svg) +See the published site: +https://skainet-developers.github.io/SKaiNET/skainet/reference/architecture.html diff --git a/docs/arduino-c-codegen.md b/docs/arduino-c-codegen.md deleted file mode 100644 index 5bc9eda8..00000000 --- a/docs/arduino-c-codegen.md +++ /dev/null @@ -1,75 +0,0 @@ -# Arduino C Code Generation - -SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. - -## Overview - -The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. - -### Codegen Pipeline - -```mermaid -graph TD - A[Kotlin Model] --> B[Recording Pass] - B --> C[Execution Tape] - C --> D[Compute Graph] - D --> E[Graph Validation] - E --> F[Memory Layout Calculation] - F --> G[C Code Emission] - G --> H[Arduino Library Packaging] - H --> I[Generated .h/.c files] -``` - -## Technical Deep Dive - -### 1. Tape-based Tracing -Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. 
When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. -- Every operation (Dense, ReLU, etc.) is recorded onto an **Execution Tape**. -- This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. - -### 2. Compute Graph Construction -The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. -- Nodes represent operations (Ops). -- Edges represent data flow (Tensors). -- During this phase, the compiler performs **Shape Inference** to ensure every tensor has a fixed, known size. - -### 3. Static Memory Management -Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a **Ping-Pong Buffer Strategy** to eliminate dynamic memory allocation (`malloc`/`free`) during inference. - -#### Ping-Pong Buffer Strategy -The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. - -```mermaid -sequenceDiagram - participant I as Input - participant B1 as Buffer A - participant B2 as Buffer B - participant O as Output - - I->>B1: Layer 1 (Input -> A) - B1->>B2: Layer 2 (A -> B) - B2->>B1: Layer 3 (B -> A) - B1->>O: Layer 4 (A -> Output) -``` - -- **Buffer Reuse**: Instead of allocating space for every layer's output, buffers are reused. -- **Direct Output Optimization**: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. - -### 4. Code Generation (Emission) -The `CCodeGenerator` emits C99-compatible code using templates. -- **Weights & Biases**: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. 
-- **Kernel Implementation**: Operations like `Dense` (Linear) are implemented as optimized nested loops. -- **Header Generation**: Produces a clean API for the user: - ```c - int model_inference(const float* input, float* output); - ``` - -### 5. Validation -The generator performs post-generation validation: -- **Static Allocation Check**: Ensures no dynamic allocation is present in the generated source. -- **Buffer Alternation Check**: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. - -## Performance and Constraints -- **Floating Point**: Currently optimized for `FP32`. -- **Supported Ops**: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. -- **Memory**: Total memory consumption is `TotalWeights + 2 * MaxIntermediateTensor`. diff --git a/docs/build_help.md b/docs/build_help.md deleted file mode 100644 index b6413e73..00000000 --- a/docs/build_help.md +++ /dev/null @@ -1,81 +0,0 @@ -# Build Help - -## Dokka API Documentation - -SKaiNET uses [Dokka 2.1.0](https://github.com/Kotlin/dokka) to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. 
- -### Generating docs locally - -**Single module:** - -```bash -./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml -``` - -Output: `skainet-lang/skainet-lang-core/build/dokka/html/` - -**Aggregated (all modules):** - -```bash -./gradlew dokkaGenerate -``` - -Output: `build/dokka/html/index.html` - -### Convention plugin details - -The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: - -- **moduleName** from `project.name` -- **moduleVersion** from the `VERSION_NAME` Gradle property -- **Documented visibilities:** public only -- **Suppressed generated files:** KSP-generated code is excluded -- **Suppressed native source sets:** `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols -- **Source links** pointing to the GitHub repository - -### Modules with Dokka enabled - -The plugin is applied to 21 library modules: - -| Group | Modules | -|-------|---------| -| skainet-lang | `skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` | -| skainet-compile | `skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, `skainet-compile-hlo`, `skainet-compile-c` | -| skainet-backends | `skainet-backend-cpu` | -| skainet-data | `skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, `skainet-data-media` | -| skainet-io | `skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` | -| Other | `skainet-pipeline`, `skainet-model-yolo` | - -**Excluded:** `skainet-bom` (no source), `skainet-apps/*`, `skainet-test/*`, benchmarks, and `skainet-lang-ksp-processor` (internal). 
- -### Root-level aggregation - -The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. - -### KSP interaction - -`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: - -```kotlin -tasks.matching { it.name.startsWith("dokka") }.configureEach { - dependsOn("kspCommonMainKotlinMetadata") -} -``` - -This ensures KSP-generated sources are available before Dokka runs. - -### GitHub Pages deployment - -The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow_dispatch`). It: - -1. Checks out the repo -2. Sets up JDK 25 -3. Runs `./gradlew dokkaGenerate` -4. Uploads the `build/dokka/html` directory as a Pages artifact -5. Deploys to GitHub Pages using `actions/deploy-pages@v4` - -**Prerequisite:** The repository must have Pages configured to deploy from GitHub Actions (Settings > Pages > Source: "GitHub Actions"). - -### Operator docs (unchanged) - -The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. diff --git a/docs/kllama-getting-started.md b/docs/kllama-getting-started.md deleted file mode 100644 index 7e7fb8e9..00000000 --- a/docs/kllama-getting-started.md +++ /dev/null @@ -1,22 +0,0 @@ -# KLlama Getting Started - -KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. - -> **Early Stage Development**: The project is in active development. We appreciate your feedback and bug reports! 
- -## Choose Your Path - -| Goal | Guide | -|---|---| -| **Run models from the command line** | [KLlama CLI](../skainet-apps/skainet-kllama-cli/README.md) | -| **Embed in a Kotlin application** | [KLlama Library](../skainet-apps/skainet-kllama/README.md) | -| **Embed in a Java application** | [Java LLM Inference Guide](java-llm-inference.md) | -| **Build a standalone Java CLI app** | [Java CLI App Guide](java-cli-app.md) | -| **Java project setup (Maven / Gradle)** | [Java Getting Started](java-getting-started.md) | - -## Quick Links - -- [Supported formats & quantization](../skainet-apps/skainet-kllama/README.md#supported-formats--quantization) -- [Custom backend integration](../skainet-apps/skainet-kllama/README.md#custom-backend-integration) -- [Agent & tool calling](java-llm-inference.md#agent-loop-and-tool-calling) -- [BERT embeddings & similarity](java-llm-inference.md#bert-encoding-and-similarity) diff --git a/docs/SKaiNET-compiler.svg b/docs/modules/ROOT/images/SKaiNET-compiler.svg similarity index 100% rename from docs/SKaiNET-compiler.svg rename to docs/modules/ROOT/images/SKaiNET-compiler.svg diff --git a/docs/SKaiNET-logo.png b/docs/modules/ROOT/images/SKaiNET-logo.png similarity index 100% rename from docs/SKaiNET-logo.png rename to docs/modules/ROOT/images/SKaiNET-logo.png diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 49fd1479..64b78995 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -1,13 +1,30 @@ * xref:index.adoc[Overview] .Tutorials -* (pages migrated in a later commit) +* xref:tutorials/java-getting-started.adoc[Java getting started] +* xref:tutorials/kllama-getting-started.adoc[KLlama getting started] +* xref:tutorials/hlo-getting-started.adoc[StableHLO getting started] +* xref:tutorials/graph-dsl.adoc[Graph DSL] .How-to guides -* (pages migrated in a later commit) +* xref:how-to/build.adoc[Build from source] +* xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] +* 
xref:how-to/java-cli-app.adoc[Build a Java CLI app] +* xref:how-to/java-llm-inference.adoc[Run LLM inference] +* xref:how-to/java-model-training.adoc[Train a model] +* xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino] .Reference -* (pages migrated in a later commit) +* xref:reference/architecture.adoc[Architecture] +* xref:reference/api.adoc[API reference (Dokka)] .Explanation -* (pages migrated in a later commit) +* xref:explanation/skainet-for-ai.adoc[SKaiNET for AI/ML] +* xref:explanation/operator-design.adoc[Operator documentation system] +* xref:explanation/theory/index.adoc[Mathematical theory] +** xref:explanation/theory/matmul.adoc[Matrix multiplication] +* xref:explanation/examples/index.adoc[Worked examples] +** xref:explanation/examples/matmul.adoc[Matrix multiplication examples] +* xref:explanation/perf/jvm-cpu.adoc[JVM CPU performance] +* xref:explanation/perf/java-25-cpu-backend.adoc[Java 25 CPU backend notes] +* xref:explanation/issues/native-macos-accelerate-simd.adoc[Native macOS Accelerate SIMD issues] diff --git a/docs/examples/index.adoc b/docs/modules/ROOT/pages/explanation/examples/index.adoc similarity index 87% rename from docs/examples/index.adoc rename to docs/modules/ROOT/pages/explanation/examples/index.adoc index 97630bec..36946d7d 100644 --- a/docs/examples/index.adoc +++ b/docs/modules/ROOT/pages/explanation/examples/index.adoc @@ -7,7 +7,7 @@ This section contains practical examples and usage patterns for SKaiNET operator === Linear Algebra -include::matmul-examples.adoc[leveloffset=+2] +include::matmul.adoc[leveloffset=+2] === Tensor Creation and Manipulation @@ -53,5 +53,5 @@ include::matmul-examples.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../theory/index.adoc[Mathematical Theory] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/theory/index.adoc[Mathematical Theory] +// Operator reference lands in a later commit of the 
Antora migration. \ No newline at end of file diff --git a/docs/examples/matmul-examples.adoc b/docs/modules/ROOT/pages/explanation/examples/matmul.adoc similarity index 100% rename from docs/examples/matmul-examples.adoc rename to docs/modules/ROOT/pages/explanation/examples/matmul.adoc diff --git a/docs/issues/native-macos-accelerate-simd.md b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc similarity index 51% rename from docs/issues/native-macos-accelerate-simd.md rename to docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc index b0317c92..4fa01b33 100644 --- a/docs/issues/native-macos-accelerate-simd.md +++ b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc @@ -1,6 +1,6 @@ -# Native macOS SIMD acceleration via Apple Accelerate framework +== Native macOS SIMD acceleration via Apple Accelerate framework -## Problem +=== Problem The `skainet-backend-cpu` module on Kotlin/Native macOS (macosArm64) uses plain scalar loops for all tensor operations (`DefaultCpuOps`). On JVM, the same module uses the JDK Vector API @@ -11,71 +11,76 @@ When running LLM inference benchmarks via the `llm-performance` native binary, t is 5-10x slower than it needs to be because every matmul is a triple-nested scalar loop (`DefaultCpuOps.kt:264-272`). -## Proposed solution +=== Proposed solution Add an Accelerate-backed `TensorOps` implementation for the macOS native target, mirroring how the JVM target has `DefaultCpuOpsJvm`. Apple's Accelerate framework provides hardware-optimized BLAS and vector DSP routines that leverage ARM NEON and AMX under the hood. -### Architecture +==== Architecture -``` +.... PlatformCpuOpsFactory ├── jvmMain → DefaultCpuOpsJvm (Vector API + optional BLAS) ← exists ├── nativeMain → DefaultCpuOps (scalar fallback) ← exists ├── macosMain → AccelerateCpuOps (Accelerate framework via cinterop) ← NEW └── linuxMain → DefaultCpuOps (scalar, or OpenBLAS in future) ← unchanged -``` +.... 
-### Key changes +==== Key changes -**1. Cinterop definition** — `src/nativeInterop/cinterop/accelerate.def` +*1. Cinterop definition* — `src/nativeInterop/cinterop/accelerate.def` -```def +[source,def] +---- package = platform.accelerate language = C headers = Accelerate/Accelerate.h compilerOpts = -framework Accelerate linkerOpts = -framework Accelerate -``` +---- -**2. New class** — `src/macosMain/kotlin/.../AccelerateCpuOps.kt` +*2. New class* — `src/macosMain/kotlin/.../AccelerateCpuOps.kt` Extends `DefaultCpuOps` and overrides hot-path operations with Accelerate calls: -| Priority | Operation | Accelerate function | Impact | -|----------|-----------|---------------------|--------| -| P0 | `matmul` | `cblas_sgemm` | Dominant cost in LLM inference (~90% of forward pass) | -| P1 | `add` | `vDSP_vadd` | Elementwise add (residual connections) | -| P1 | `multiply` | `vDSP_vmul` | Elementwise multiply (gates, scaling) | -| P1 | `subtract` | `vDSP_vsub` | Elementwise subtract | -| P1 | `divide` | `vDSP_vdiv` | Elementwise divide | -| P2 | `sum` (global) | `vDSP_sve` | Reduction for normalization | -| P2 | `mean` (global) | `vDSP_meanv` | Reduction for normalization | -| P2 | `softmax` | `vDSP_vse` + manual | Attention weights | -| P3 | `relu` | `vDSP_vthres` / `vDSP_vthr` | Activation function | -| P3 | `silu` | manual vectorized loop | Activation function (SiLU = x * sigmoid(x)) | -| P3 | `transpose` | `vDSP_mtrans` | Matrix transpose | - -**3. 
Platform factory** — update `PlatformCpuOpsFactory` for macOS - -```kotlin +[cols=",,,",options="header",] +|=== +|Priority |Operation |Accelerate function |Impact +|P0 |`matmul` |`cblas++_++sgemm` |Dominant cost in LLM inference (~90% of forward pass) +|P1 |`add` |`vDSP++_++vadd` |Elementwise add (residual connections) +|P1 |`multiply` |`vDSP++_++vmul` |Elementwise multiply (gates, scaling) +|P1 |`subtract` |`vDSP++_++vsub` |Elementwise subtract +|P1 |`divide` |`vDSP++_++vdiv` |Elementwise divide +|P2 |`sum` (global) |`vDSP++_++sve` |Reduction for normalization +|P2 |`mean` (global) |`vDSP++_++meanv` |Reduction for normalization +|P2 |`softmax` |`vDSP++_++vse` {plus} manual |Attention weights +|P3 |`relu` |`vDSP++_++vthres` / `vDSP++_++vthr` |Activation function +|P3 |`silu` |manual vectorized loop |Activation function (SiLU = x ++*++ sigmoid(x)) +|P3 |`transpose` |`vDSP++_++mtrans` |Matrix transpose +|=== + +*3. Platform factory* — update `PlatformCpuOpsFactory` for macOS + +[source,kotlin] +---- // src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { println("[SKaiNET] Using Accelerate-backed CPU operations (ARM NEON + AMX)") return { factory -> AccelerateCpuOps(factory) } } -``` +---- This requires splitting the current `nativeMain` expect/actual into separate `macosMain` and `linuxMain` actuals (the `macosMain` source set already exists in `build.gradle.kts`). -**4. Build changes** — `build.gradle.kts` +*4. Build changes* — `build.gradle.kts` Add cinterop configuration for macosArm64 (and optionally iosArm64/iosSimulatorArm64): -```kotlin +[source,kotlin] +---- macosArm64 { compilations["main"].cinterops { val accelerate by creating { @@ -83,43 +88,43 @@ macosArm64 { } } } -``` +---- Add linker opts for the Accelerate framework to all macOS/iOS binaries. 
-### Implementation notes +==== Implementation notes -- `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. - Non-accelerated operations fall through to the scalar implementation. -- The `matmul` override should handle 2D FP32 tensors with `cblas_sgemm` and delegate - batched/non-float cases to `super.matmul()`. -- `vDSP_*` functions operate on contiguous `FloatArray` buffers. Tensors backed by - `FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. -- Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer - and only dispatch the contiguous inner loop to Accelerate. -- The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since - Accelerate is available on all Apple platforms. +* `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. +Non-accelerated operations fall through to the scalar implementation. +* The `matmul` override should handle 2D FP32 tensors with `cblas++_++sgemm` and delegate +batched/non-float cases to `super.matmul()`. +* `vDSP++_*++` functions operate on contiguous `FloatArray` buffers. Tensors backed by +`FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. +* Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer +and only dispatch the contiguous inner loop to Accelerate. +* The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since +Accelerate is available on all Apple platforms. -### Testing +==== Testing -- Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). -- Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output - or add a query method). -- Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend - vs Accelerate backend on the same model. 
+* Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). +* Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output +or add a query method). +* Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend +vs Accelerate backend on the same model. -### Expected impact +==== Expected impact Based on JVM BLAS vs scalar measurements and Apple's published Accelerate performance data: -- **matmul**: 10-50x speedup (NEON + AMX vs scalar loop) -- **elementwise**: 4-8x speedup (NEON vectorization) -- **reductions**: 4-8x speedup (NEON vectorization) -- **overall LLM inference**: 5-20x speedup on native macOS CPU backend +* *matmul*: 10-50x speedup (NEON {plus} AMX vs scalar loop) +* *elementwise*: 4-8x speedup (NEON vectorization) +* *reductions*: 4-8x speedup (NEON vectorization) +* *overall LLM inference*: 5-20x speedup on native macOS CPU backend -### Files to create/modify +==== Files to create/modify -``` +.... skainet-backends/skainet-backend-cpu/ ├── build.gradle.kts # add cinterop ├── src/nativeInterop/cinterop/accelerate.def # NEW @@ -127,12 +132,12 @@ skainet-backends/skainet-backend-cpu/ ├── src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt # NEW ├── src/linuxMain/kotlin/.../PlatformCpuOpsFactory.linux.kt # NEW (move from nativeMain) └── src/nativeMain/kotlin/.../PlatformCpuOpsFactory.native.kt # REMOVE (split to platform-specific) -``` +.... 
-### References +==== References -- JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` -- JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` -- Apple Accelerate docs: https://developer.apple.com/documentation/accelerate -- CBLAS reference: https://developer.apple.com/documentation/accelerate/blas -- vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp +* JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` +* JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` +* Apple Accelerate docs: https://developer.apple.com/documentation/accelerate +* CBLAS reference: https://developer.apple.com/documentation/accelerate/blas +* vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp diff --git a/docs/ops-docs.adoc b/docs/modules/ROOT/pages/explanation/operator-design.adoc similarity index 96% rename from docs/ops-docs.adoc rename to docs/modules/ROOT/pages/explanation/operator-design.adoc index faa5d75c..6ce1d8d4 100644 --- a/docs/ops-docs.adoc +++ b/docs/modules/ROOT/pages/explanation/operator-design.adoc @@ -101,7 +101,7 @@ Your article must be written in AsciiDoc and include the following sections (use - Show how fragments embed: • An API signature block • A status table by backend - • Pointers (xref::) to human-written math/semantics sections + • Pointers (`xref:`) to human-written math/semantics sections - Provide example AsciiDoc fragment: [source,adoc] @@ -126,7 +126,7 @@ Your article must be written in AsciiDoc and include the following sections (use See xref:theory/matmul.adoc#definition[MatMul semantics] and xref:examples/matmul.adoc#examples[Examples]. 
---- -- Demonstrate combining generated and human-written docs via include:: and xref::, with a small folder layout: +- Demonstrate combining generated and human-written docs via `include::` and `xref:`, with a small folder layout: [source,text] ---- docs/ @@ -147,7 +147,7 @@ Your article must be written in AsciiDoc and include the following sections (use • Human-written caveats that reference generated statuses via xref anchors. - Show a synchronization flow as a Mermaid diagram: - [source,mermaid] + [mermaid] ---- flowchart LR A[Operator Interfaces (KMP)] --> B[KSP Processor] @@ -226,7 +226,7 @@ Your article must be written in AsciiDoc and include the following sections (use ---- - Show the KSP-produced JSON excerpt and the corresponding generated AsciiDoc fragment for at least one function (e.g., relu). -- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it’s included via xref:: from the generated fragment. +- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it is included via `xref:` from the generated fragment. == 8. Summary and Benefits - Summarize benefits: @@ -246,7 +246,7 @@ APPENDIX (OPTIONAL BUT STRONGLY RECOMMENDED) OUTPUT FORMAT REQUIREMENTS - Write the entire article as **AsciiDoc**. -- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [source,mermaid], [source,plantuml] (PlantUML optional). +- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [mermaid], [source,plantuml] (PlantUML optional). - Use short paragraphs and bullet lists; avoid filler or marketing language. - Include at least: • One Mermaid diagram (the pipeline). @@ -260,7 +260,7 @@ ACCEPTANCE CHECKLIST (the output must satisfy all) - [ ] Clear definition of “reflective documentation” and how it differs from classic docgen. - [ ] KSP plan, annotation semantics, and JSON Schema included. 
- [ ] Example Operator (TensorOps) + generated metadata + generated AsciiDoc fragment. -- [ ] Demonstrated include:: and xref:: usage. +- [ ] Demonstrated `include::` and `xref:` usage. - [ ] Mermaid pipeline diagram present. - [ ] Gradle/Dokka/AsciiDoctorJ integration details with code. - [ ] Summary articulates benefits and risks. \ No newline at end of file diff --git a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc new file mode 100644 index 00000000..2b74c01c --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc @@ -0,0 +1,111 @@ +==== Java 25 Advantages for the JVM CPU Backend + +Java 25 (GA September 2025) delivers significant free performance improvements to the +SKaiNET JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup +features — all without requiring code changes. + +===== Compatibility + +The same code, same flags, and same runtime detection work across JDK 21–25: + +* Vector API remains incubator on JDK 25 (JEP 508) — identical `jdk.incubator.vector` package. +* Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22{plus}. +* Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. +* Build config (`jvmTarget = JVM++_++21`, `options.release.set(21)`) produces compatible bytecode. + +*No special treatment is needed for JDK ++>++= 21 but ++<++ 25.* + +Required flags remain: + +.... +--enable-preview --add-modules jdk.incubator.vector +.... + +[[jit--c2-improvements-mapped-to-skainet-ops]] +===== JIT / C2 improvements mapped to SKaiNET ops + +These are automatic — the JIT produces better native code for existing bytecode. 
+ +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|VPointer refactoring for vector loads/stores |https://bugs.openjdk.org/browse/JDK-8350748[JDK-8350748] |up to 14x |All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` +|SuperWord SIMD enhancement |https://bugs.openjdk.org/browse/JDK-8343685[JDK-8343685] |up to 33x |Same vectorized loops (elementwise, reductions, matmul inner loops) +|`Math.max` / `Math.min` intrinsified for `long` |JDK-8350485 |3–5x |Shape computation, tile clamping in blocked matmul +|=== + +Source files: + +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` + +===== Panama FFI improvements + +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|Faster `MemorySegment` allocation |https://bugs.openjdk.org/browse/JDK-8345687[JDK-8345687] |~2x |`MemorySegmentTensorData.kt` (`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` +|`MemorySegment::fill` optimized on AArch64 |https://bugs.openjdk.org/browse/JDK-8354674[JDK-8354674] |~2.5x |Tensor zeroing, blocked matmul result initialization +|=== + +Source files: + +* `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` +* `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` + +===== Object layout and GC + +* *Compact Object Headers* (JEP 519) — reduces object header from 12 to 8 bytes. +Meaningful for tensor metadata arrays with millions of small objects. +Opt-in: `-XX:{plus}UseCompactObjectHeaders` +* *Generational Shenandoah* (JEP 521) — lower GC pause times for allocation-heavy +workloads (tensor creation, KV cache churn). 
+Opt-in: `-XX:{plus}UseShenandoahGC -XX:ShenandoahGCMode=generational` + +===== Startup and warmup + +* *AOT profiling / caching* (JEP 515) — records JIT profile data from a training run +and replays it on subsequent launches. Reduces warmup by 15–25%. +Useful for CLI apps like kLLaMA where first-token latency matters. + +Usage: + +.... +# Training run (records profile) +java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" + +# Production run (replays profile) +java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" +.... + +===== Recommended JVM flags for Java 25 + +Required (same as JDK 21–24): + +.... +--enable-preview +--add-modules jdk.incubator.vector +.... + +Optional — enable for maximum benefit on JDK 25: + +.... +-XX:+UseCompactObjectHeaders +-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational +-XX:AOTCache=app.aot # after training run +.... + +===== Summary + +[cols=",,",options="header",] +|=== +|Feature |Benefit |Component +|VPointer refactoring (C2) |Up to 14x faster vector loads/stores |`JvmVectorKernels`, `JvmQuantizedVectorKernels` +|SuperWord SIMD (C2) |Up to 33x faster auto-vectorized loops |Same vector kernel files +|`Math.max/min` intrinsic |3–5x faster long comparisons |Shape computation, tile clamping +|Faster segment allocation |~2x allocation throughput |`MemorySegmentTensorDataFactory`, `PagedKvCache` +|`MemorySegment::fill` (AArch64) |~2.5x faster bulk zeroing |Tensor init, matmul result buffers +|Compact Object Headers |~30% smaller object headers |All tensor metadata +|Generational Shenandoah |Lower GC pauses |Allocation-heavy inference +|AOT profiling |15–25% faster warmup |CLI apps (kLLaMA) +|=== diff --git a/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc new file mode 100644 index 00000000..167aac22 --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc @@ -0,0 +1,110 @@ +==== JVM CPU Backend Performance Benchmarks (JMH) + +This page 
explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence for performance targets. + +===== What’s included + +* Elementwise: FP32 `add` on 1,000,000 elements +* Reductions: FP32 `sum` and `mean` on 1,000,000 elements +* Matmul: FP32 square `matmul` with sizes 256, 512, and 1024 + +Benchmarks are implemented in module: + +* `:skainet-backends:benchmarks:jvm-cpu-jmh` + +Source files: + +* `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt` +* `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt` +* `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt` + +===== Prerequisites + +* JDK 21{plus} (JDK 22 toolchain configured by Gradle) +* Gradle will pass required JVM flags: +** `--enable-preview` +** `--add-modules jdk.incubator.vector` + +For Java 25-specific performance advantages, see xref:explanation/perf/java-25-cpu-backend.adoc[Java 25 CPU Backend]. + +===== Feature flags + +You can toggle acceleration paths at runtime using system properties or environment variables: + +* Vector acceleration: +** `-Dskainet.cpu.vector.enabled=true++|++false` +** or `SKAINET++_++CPU++_++VECTOR++_++ENABLED=true++|++false` +* BLAS via Panama (matmul heuristic for larger sizes): +** `-Dskainet.cpu.blas.enabled=true++|++false` +** or `SKAINET++_++CPU++_++BLAS++_++ENABLED=true++|++false` + +Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args. + +===== How to run all benchmarks + +From repository root: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh +.... + +This will build and execute all JMH benchmarks with the default parameters defined in sources. + +===== Run specific benchmarks + +* Elementwise add (both vector on/off): + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=ElementwiseAdd1MBench +.... + +* Reductions (vector on/off): + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=Reductions1MBench +.... 
+ +* Matmul, all sizes, with vector on and BLAS on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true +.... + +* Matmul at 512 only, comparing BLAS on/off with vector on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.size=512 \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true,false +.... + +Notes: + +* You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). +* JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. + +===== Recording environment details + +Include at minimum: + +* CPU model, cores/threads, base/boost clock +* RAM size and speed +* OS version +* JDK version and vendor +* Gradle version +* JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) +* SKaiNET flags used (vector, BLAS) + +===== Performance targets (to be validated on your hardware) + +* ≥ 4× speedup on FP32 `matmul` 512×512 vs baseline scalar +* ≥ 3× speedup on FP32 `add` with 1M elements vs baseline scalar + +Use the above commands to produce “vector=false/blas=false” baselines vs “vector=true++[++/blas=true++]++” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. diff --git a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc new file mode 100644 index 00000000..102aa5ac --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc @@ -0,0 +1,143 @@ +[[skainet-core-technology-tensor--data-guide]] +== SKaiNET Core Technology: Tensor & Data Guide + +This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. 
+ +[[1-fundamental-architecture-tensor-composition]] +=== 1. Fundamental Architecture: Tensor Composition + +Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a *compositional architecture*. A `Tensor++<++T, V++>++` is composed of two primary components: + +[arabic] +. *`TensorData++<++T, V++>++`*: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. +. *`TensorOps`*: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). + +This separation allows for high flexibility, such as switching execution backends without changing the data representation. + +[source,kotlin] +---- +interface Tensor<T, V> { + val data: TensorData<T, V> + val ops: TensorOps + val dtype: KClass<T> + val shape: Shape +} +---- + +[[2-type-safe-tensor-creation-dsl]] +=== 2. Type-Safe Tensor Creation (DSL) + +SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). + +==== Creation with `ExecutionContext` + +Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. + +[source,kotlin] +---- +// Basic creation +val zeros = ctx.zeros(Shape(2, 3), FP32::class) +val ones = ctx.ones(Shape(1, 10), Int32::class) +val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) +---- + +==== Expressive Tensor DSL + +For more complex initializations, use the `tensor` DSL: + +[source,kotlin] +---- +val myTensor = tensor(ctx, FP32::class) { + shape(2, 2) { + from(1.0f, 2.0f, 3.0f, 4.0f) + } +} + +val randomTensor = tensor(ctx, FP32::class) { + shape(10, 10) { + randn(mean = 0f, std = 1f) + } +} + +val customInit = tensor(ctx, Int32::class) { + shape(5, 5) { + init { indices -> indices[0] + indices[1] } + } +} +---- + +[[3-slicing-dsl-api]] +=== 3. 
Slicing DSL API + +SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. + +==== `sliceView` vs `sliceCopy` + +* *`sliceView`*: Creates a `TensorView`, which is a window into the original data (no data copying). +* *`sliceCopy`*: Creates a new `Tensor` with a copy of the sliced data. + +==== Slicing DSL Syntax + +The `SegmentBuilder` provides several ways to define slices for each dimension: + +* `range(start, end)`: A range of indices. +* `at(index)`: A single index (reduces rank). +* `all()`: All elements in that dimension (equivalent to `:` in NumPy). +* `step(start, end, step)`: Strided access. +* `{plus}all()`: Short-hand for `all()`. + +[source,kotlin] +---- +val source = ctx.ones(Shape(10, 20, 30), FP32::class) + +// Slicing: [0:5, 10, :] +val view = source.sliceView { + segment { range(0, 5) } // Dim 0 + segment { at(10) } // Dim 1 + segment { all() } // Dim 2 +} +---- + +[[4-core-operations-tensorops]] +=== 4. Core Operations (`TensorOps`) + +All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: + +* *Element-wise Ops*: `add`, `subtract`, `multiply`, `divide` (and scalar versions). +* *Linear Algebra*: `matmul`, `transpose`. +* *Neural Network Ops*: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. +* *Reductions*: `sum`, `mean`, `variance`. +* *Shape Ops*: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. + +==== Operator Overloading + +When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: + +[source,kotlin] +---- +val c = a + b // Calls ops.add(a, b) +val d = a * 10 // Calls ops.mulScalar(a, 10) +---- + +[[5-summary-table-skainet-vs-numpy]] +=== 5. 
Summary Table: SKaiNET vs NumPy + +[cols="<,<,<",options="header",] +|=== +|Feature |NumPy |SKaiNET +|*Primary Type* |`ndarray` |`Tensor++<++T, V++>++` +|*Creation* |`np.array(++[++1, 2, 3++]++)` |`tensor(ctx, FP32::class) ++{++ shape(3) ++{++ from(1f, 2f, 3f) } }` +|*Zeros* |`np.zeros((2, 2))` |`ctx.zeros(Shape(2, 2), FP32::class)` +|*Slicing* |`a++[++0:5, :++]++` |`a.sliceView ++{++ segment ++{++ range(0, 5) }; segment ++{++ all() } }` +|*Matmul* |`a @ b` or `np.matmul(a, b)` |`ctx.ops.matmul(a, b)` +|*Reshape* |`a.reshape(new++_++shape)` |`ctx.ops.reshape(a, Shape(new++_++shape))` +|=== + +[[6-best-practices-for-ai-integration]] +=== 6. Best Practices for AI Integration + +[arabic] +. *Context Awareness*: Always pass the `ExecutionContext` to functions that create or manipulate tensors. +. *Type Safety*: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. +. *Views over Copies*: Use `sliceView` whenever possible to minimize memory overhead and improve performance. +. *Backend Agnostic*: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend. 
diff --git a/docs/theory/index.adoc b/docs/modules/ROOT/pages/explanation/theory/index.adoc similarity index 80% rename from docs/theory/index.adoc rename to docs/modules/ROOT/pages/explanation/theory/index.adoc index e82bd082..80917802 100644 --- a/docs/theory/index.adoc +++ b/docs/modules/ROOT/pages/explanation/theory/index.adoc @@ -5,10 +5,6 @@ This section contains mathematical definitions and theoretical foundations for S [#operator-theory] == Operator Theory -=== Architecture - -include::composite-ops.adoc[leveloffset=+2] - === Linear Algebra Operations include::matmul.adoc[leveloffset=+2] @@ -36,5 +32,5 @@ include::matmul.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../examples/index.adoc[Usage Examples] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/examples/index.adoc[Usage Examples] +// Operator reference lands in a later commit of the Antora migration. \ No newline at end of file diff --git a/docs/theory/matmul.adoc b/docs/modules/ROOT/pages/explanation/theory/matmul.adoc similarity index 100% rename from docs/theory/matmul.adoc rename to docs/modules/ROOT/pages/explanation/theory/matmul.adoc diff --git a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc new file mode 100644 index 00000000..7ef1165c --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc @@ -0,0 +1,95 @@ +== Arduino C Code Generation + +SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. + +=== Overview + +The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. 
+ +==== Codegen Pipeline + +[mermaid] +---- +graph TD + A[Kotlin Model] --> B[Recording Pass] + B --> C[Execution Tape] + C --> D[Compute Graph] + D --> E[Graph Validation] + E --> F[Memory Layout Calculation] + F --> G[C Code Emission] + G --> H[Arduino Library Packaging] + H --> I[Generated .h/.c files] +---- + +=== Technical Deep Dive + +[[1-tape-based-tracing]] +==== 1. Tape-based Tracing + +Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. + +* Every operation (Dense, ReLU, etc.) is recorded onto an *Execution Tape*. +* This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. + +[[2-compute-graph-construction]] +==== 2. Compute Graph Construction + +The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. + +* Nodes represent operations (Ops). +* Edges represent data flow (Tensors). +* During this phase, the compiler performs *Shape Inference* to ensure every tensor has a fixed, known size. + +[[3-static-memory-management]] +==== 3. Static Memory Management + +Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a *Ping-Pong Buffer Strategy* to eliminate dynamic memory allocation (`malloc`/`free`) during inference. + +===== Ping-Pong Buffer Strategy + +The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. 
+ +[mermaid] +---- +sequenceDiagram + participant I as Input + participant B1 as Buffer A + participant B2 as Buffer B + participant O as Output + + I->>B1: Layer 1 (Input -> A) + B1->>B2: Layer 2 (A -> B) + B2->>B1: Layer 3 (B -> A) + B1->>O: Layer 4 (A -> Output) +---- + +* *Buffer Reuse*: Instead of allocating space for every layer's output, buffers are reused. +* *Direct Output Optimization*: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. + +[[4-code-generation-emission]] +==== 4. Code Generation (Emission) + +The `CCodeGenerator` emits C99-compatible code using templates. + +* *Weights & Biases*: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. +* *Kernel Implementation*: Operations like `Dense` (Linear) are implemented as optimized nested loops. +* *Header Generation*: Produces a clean API for the user: ++ +[source,c] +---- +int model_inference(const float* input, float* output); +---- + +[[5-validation]] +==== 5. Validation + +The generator performs post-generation validation: + +* *Static Allocation Check*: Ensures no dynamic allocation is present in the generated source. +* *Buffer Alternation Check*: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. + +=== Performance and Constraints + +* *Floating Point*: Currently optimized for `FP32`. +* *Supported Ops*: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. +* *Memory*: Total memory consumption is `TotalWeights {plus} 2 ++*++ MaxIntermediateTensor`. 
diff --git a/docs/modules/ROOT/pages/how-to/build.adoc b/docs/modules/ROOT/pages/how-to/build.adoc new file mode 100644 index 00000000..c3a6b6d6 --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/build.adoc @@ -0,0 +1,87 @@ +== Build Help + +=== Dokka API Documentation + +SKaiNET uses https://github.com/Kotlin/dokka[Dokka 2.1.0] to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. + +==== Generating docs locally + +*Single module:* + +[source,bash] +---- +./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml +---- + +Output: `skainet-lang/skainet-lang-core/build/dokka/html/` + +*Aggregated (all modules):* + +[source,bash] +---- +./gradlew dokkaGenerate +---- + +Output: `build/dokka/html/index.html` + +==== Convention plugin details + +The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: + +* *moduleName* from `project.name` +* *moduleVersion* from the `VERSION++_++NAME` Gradle property +* *Documented visibilities:* public only +* *Suppressed generated files:* KSP-generated code is excluded +* *Suppressed native source sets:* `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols +* *Source links* pointing to the GitHub repository + +==== Modules with Dokka enabled + +The plugin is applied to 21 library modules: + +[cols=",",options="header",] +|=== +|Group |Modules +|skainet-lang |`skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` +|skainet-compile |`skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, `skainet-compile-hlo`, `skainet-compile-c` +|skainet-backends |`skainet-backend-cpu` +|skainet-data |`skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, 
`skainet-data-media` +|skainet-io |`skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` +|Other |`skainet-pipeline`, `skainet-model-yolo` +|=== + +*Excluded:* `skainet-bom` (no source), `skainet-apps/++*++`, `skainet-test/++*++`, benchmarks, and `skainet-lang-ksp-processor` (internal). + +==== Root-level aggregation + +The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. + +==== KSP interaction + +`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: + +[source,kotlin] +---- +tasks.matching { it.name.startsWith("dokka") }.configureEach { + dependsOn("kspCommonMainKotlinMetadata") +} +---- + +This ensures KSP-generated sources are available before Dokka runs. + +==== GitHub Pages deployment + +The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow++_++dispatch`). It: + +[arabic] +. Checks out the repo +. Sets up JDK 25 +. Runs `./gradlew dokkaGenerate` +. Uploads the `build/dokka/html` directory as a Pages artifact +. Deploys to GitHub Pages using `actions/deploy-pages@v4` + +*Prerequisite:* The repository must have Pages configured to deploy from GitHub Actions (Settings ++>++ Pages ++>++ Source: "GitHub Actions"). + +==== Operator docs (unchanged) + +The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. 
diff --git a/docs/io-readers-guide.md b/docs/modules/ROOT/pages/how-to/io-readers.adoc similarity index 88% rename from docs/io-readers-guide.md rename to docs/modules/ROOT/pages/how-to/io-readers.adoc index d431a7c3..1f4b18da 100644 --- a/docs/io-readers-guide.md +++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc @@ -1,48 +1,54 @@ -# SKaiNET I/O Readers Guide +== SKaiNET I/O Readers Guide This guide demonstrates how to use SKaiNET's GGUF and ONNX readers in your Kotlin Multiplatform projects. -## Overview +=== Overview SKaiNET provides two main I/O modules for reading AI model formats: -- **skainet-io-gguf**: For reading GGUF (GPT-Generated Unified Format) files -- **skainet-io-onnx**: For reading ONNX (Open Neural Network Exchange) files + +* *skainet-io-gguf*: For reading GGUF (GPT-Generated Unified Format) files +* *skainet-io-onnx*: For reading ONNX (Open Neural Network Exchange) files Both modules are built on Kotlin Multiplatform and support JVM, Android, iOS, JS, WASM, and Native platforms. -## Dependencies +=== Dependencies Add the following dependencies to your `build.gradle.kts`: -### For GGUF Support +==== For GGUF Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-gguf:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") } -``` +---- -### For ONNX Support +==== For ONNX Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-onnx:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") implementation("pro.streem.pbandk:pbandk-runtime:0.16.0") } -``` +---- -## GGUF Reader Usage +=== GGUF Reader Usage -> **Recommended:** For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. -> The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting -> files over 100 GB without heap-loading the entire file. It also supports quantized types -> (Q4_K, Q8_0, etc.) via `StreamingGgufParametersLoader`. 
See the streaming examples below. +____ +*Recommended:* For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. +The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting +files over 100 GB without heap-loading the entire file. It also supports quantized types +(Q4++_++K, Q8++_++0, etc.) via `StreamingGgufParametersLoader`. See the streaming examples below. +____ -### Streaming GGUF Reading (Recommended) +==== Streaming GGUF Reading (Recommended) -```kotlin +[source,kotlin] +---- import sk.ainet.io.JvmRandomAccessSource import sk.ainet.io.gguf.StreamingGGUFReader @@ -60,16 +66,19 @@ fun readLargeModel(filePath: String) { println("Encoding: ${storage.encoding.name}, Physical: ${storage.physicalBytes} bytes") } } -``` +---- -### Legacy GGUF Reading +==== Legacy GGUF Reading -> **Note:** The legacy `GGUFReader` loads the entire file into memory and only supports -> F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. +____ +*Note:* The legacy `GGUFReader` loads the entire file into memory and only supports +F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. 
+____ -### Basic GGUF Reading +==== Basic GGUF Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import kotlinx.io.buffered @@ -114,11 +123,12 @@ suspend fun readGGUFModel(filePath: String) { } } } -``` +---- -### Working with Tensor Data +==== Working with Tensor Data -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader import sk.ainet.io.gguf.GGMLQuantizationType @@ -155,11 +165,12 @@ fun processTensorData(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading for Large Models +==== Lazy Loading for Large Models -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader fun readGGUFMetadataOnly(filePath: String) { @@ -184,13 +195,14 @@ fun readGGUFMetadataOnly(filePath: String) { } } } -``` +---- -## ONNX Reader Usage +=== ONNX Reader Usage -### Basic ONNX Reading +==== Basic ONNX Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import sk.ainet.io.onnx.OnnxLoader @@ -229,11 +241,12 @@ suspend fun readONNXModel(filePath: String) { println(" Outputs: ${graph.output.size}") } } -``` +---- -### Working with ONNX Graph Structure +==== Working with ONNX Graph Structure -```kotlin +[source,kotlin] +---- import onnx.ModelProto import onnx.NodeProto import onnx.TensorProto @@ -286,11 +299,12 @@ fun getAttributeValue(attr: onnx.AttributeProto): String { fun getTensorShapeString(tensor: TensorProto): String { return tensor.dims.joinToString("x") { it.toString() } } -``` +---- -### Custom ONNX Loader with Error Handling +==== Custom ONNX Loader with Error Handling -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import sk.ainet.io.onnx.OnnxLoader import sk.ainet.io.onnx.OnnxLoadedModel @@ -352,13 +366,14 @@ suspend fun safeLoadOnnx(filePath: String) { println("Failed to load ONNX model: ${error.message}") } } -``` +---- -## Platform-Specific Considerations +=== Platform-Specific Considerations -### JVM Platform +==== JVM Platform -```kotlin 
+[source,kotlin] +---- // JVM-specific file reading import java.io.File import java.nio.file.Path @@ -366,11 +381,12 @@ import java.nio.file.Path fun readFromFile(path: Path): Source { return path.toFile().inputStream().asSource().buffered() } -``` +---- -### Android Platform +==== Android Platform -```kotlin +[source,kotlin] +---- // Android-specific asset reading import android.content.Context import android.content.res.AssetManager @@ -378,11 +394,12 @@ import android.content.res.AssetManager fun readFromAssets(context: Context, fileName: String): Source { return context.assets.open(fileName).asSource().buffered() } -``` +---- -### iOS/Native Platform +==== iOS/Native Platform -```kotlin +[source,kotlin] +---- // Native platform file reading import kotlinx.io.files.Path import kotlinx.io.files.SystemFileSystem @@ -391,13 +408,14 @@ fun readFromNativePath(pathString: String): Source { val path = Path(pathString) return SystemFileSystem.source(path).buffered() } -``` +---- -## Performance Tips +=== Performance Tips -### Memory Management +==== Memory Management -```kotlin +[source,kotlin] +---- // For large models, consider streaming or chunked processing fun processLargeModel(reader: GGUFReader) { // Process tensors one at a time to manage memory @@ -411,11 +429,12 @@ fun processLargeModel(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading Strategy +==== Lazy Loading Strategy -```kotlin +[source,kotlin] +---- class ModelManager { private var reader: GGUFReader? = null private val tensorCache = mutableMapOf>() @@ -431,11 +450,12 @@ class ModelManager { } } } -``` +---- -## Error Handling Best Practices +=== Error Handling Best Practices -```kotlin +[source,kotlin] +---- sealed class ModelLoadResult { data class Success(val model: T) : ModelLoadResult() data class Error(val message: String, val cause: Throwable? 
= null) : ModelLoadResult() @@ -459,13 +479,14 @@ suspend fun loadModelSafely(filePath: String): ModelLoadResult { ModelLoadResult.Error("Failed to load model: ${e.message}", e) } } -``` +---- -## Integration Examples +=== Integration Examples -### Using with Coroutines +==== Using with Coroutines -```kotlin +[source,kotlin] +---- import kotlinx.coroutines.* class AsyncModelLoader { @@ -495,6 +516,6 @@ class AsyncModelLoader { } data class ProcessedTensor(val name: String, val size: Int) -``` +---- -This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. \ No newline at end of file +This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. diff --git a/docs/java-cli-app.md b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc similarity index 85% rename from docs/java-cli-app.md rename to docs/modules/ROOT/pages/how-to/java-cli-app.adoc index c2288a5d..a233942d 100644 --- a/docs/java-cli-app.md +++ b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc @@ -1,22 +1,23 @@ -# Building a Java CLI App with KLlama +== Building a Java CLI App with KLlama -This guide walks you through creating a standalone Java 21+ command-line application that loads a LLaMA model and generates text using the KLlama library. +This guide walks you through creating a standalone Java 21{plus} command-line application that loads a LLaMA model and generates text using the KLlama library. 
-## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** -- A GGUF model file (e.g., [TinyLlama-1.1B-Chat GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)) +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* +* A GGUF model file (e.g., https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF[TinyLlama-1.1B-Chat GGUF]) ---- +''''' -## Project Setup +=== Project Setup -### Maven +==== Maven Create a `pom.xml`: -```xml +[source,xml] +---- System.out.print(token)); System.out.println(); } -``` +---- ---- +''''' -## Async Generation +=== Async Generation Use `generateAsync` to run generation on a virtual thread and get a `CompletableFuture`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { @@ -262,20 +269,21 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { String result = future.join(); System.out.println(result); } -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [Java LLM Inference Guide](java-llm-inference.md) — BERT embeddings, agent/tool-calling, and more. -- [Java Getting Started](java-getting-started.md) — tensor operations, full Maven/Gradle setup. -- [KLlama Library](../skainet-apps/skainet-kllama/README.md) — custom backends and Kotlin embedding. +* link:java-llm-inference.md[Java LLM Inference Guide] — BERT embeddings, agent/tool-calling, and more. +* link:java-getting-started.md[Java Getting Started] — tensor operations, full Maven/Gradle setup. 
+* link:../skainet-apps/skainet-kllama/README.md[KLlama Library] — custom backends and Kotlin embedding. diff --git a/docs/java-llm-inference.md b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc similarity index 78% rename from docs/java-llm-inference.md rename to docs/modules/ROOT/pages/how-to/java-llm-inference.adoc index feb94244..567b9aa1 100644 --- a/docs/java-llm-inference.md +++ b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc @@ -1,15 +1,16 @@ -# Java LLM Inference Guide +== Java LLM Inference Guide This guide covers loading and running large language models (LLaMA, BERT) from Java using SKaiNET's blocking, streaming, and async APIs. -## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -47,19 +48,20 @@ This guide covers loading and running large language models (LLaMA, BERT) from J skainet-backend-cpu-jvm -``` +---- ---- +''''' -## LLaMA Inference +=== LLaMA Inference All LLaMA Java classes live in `sk.ainet.apps.kllama.java`. -### Loading a GGUF Model +==== Loading a GGUF Model The simplest way to get started is to load a GGUF file. `KLlamaJava.loadGGUF()` handles context creation, weight loading, quantization dispatch, and tokenizer setup behind the scenes. -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.java.GenerationConfig; @@ -73,50 +75,54 @@ public class LlamaExample { } } } -``` +---- `KLlamaSession` implements `AutoCloseable`, so `try-with-resources` properly releases the off-heap memory arenas when you are done. 
-### Loading SafeTensors (HuggingFace Format) +==== Loading SafeTensors (HuggingFace Format) If you have a HuggingFace model directory containing `model.safetensors`, `config.json`, and `tokenizer.json`: -```java +[source,java] +---- try (KLlamaSession session = KLlamaJava.loadSafeTensors(Path.of("./my-llama-model/"))) { String response = session.generate("Once upon a time"); System.out.println(response); } -``` +---- The directory must contain: -- `model.safetensors` -- the model weights -- `config.json` -- model architecture config (hidden size, layers, heads, etc.) -- `tokenizer.json` -- HuggingFace tokenizer definition ---- +* `model.safetensors` -- the model weights +* `config.json` -- model architecture config (hidden size, layers, heads, etc.) +* `tokenizer.json` -- HuggingFace tokenizer definition -## GenerationConfig +''''' + +=== GenerationConfig Control generation parameters with the builder pattern: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(256) // maximum tokens to generate (default: 256) .temperature(0.7f) // sampling temperature (default: 0.8) .build(); String response = session.generate("Explain quantum computing", config); -``` +---- Use `GenerationConfig.defaults()` for the default configuration (256 max tokens, 0.8 temperature). ---- +''''' -## Streaming Generation +=== Streaming Generation -Pass a `Consumer` to receive each token as it is generated. This is useful for displaying output in real time: +Pass a `Consumer++<++String++>++` to receive each token as it is generated. This is useful for displaying output in real time: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(512) .temperature(0.9f) @@ -129,17 +135,18 @@ String fullResponse = session.generate( ); System.out.println(); // newline after streaming -``` +---- -The `generate` overload with a `Consumer` still returns the complete generated text as its return value. 
+The `generate` overload with a `Consumer++<++String++>++` still returns the complete generated text as its return value. ---- +''''' -## Async Generation +=== Async Generation -`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture`: +`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture++<++String++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = session.generateAsync( @@ -150,27 +157,29 @@ CompletableFuture future = session.generateAsync( // Do other work while generation runs... String result = future.join(); // block when you need the result System.out.println(result); -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## BERT Encoding and Similarity +=== BERT Encoding and Similarity All BERT Java classes live in `sk.ainet.apps.bert.java`. 
-### Loading a BERT Model +==== Loading a BERT Model Load a BERT model from a HuggingFace directory containing `model.safetensors` and `vocab.txt`: -```java +[source,java] +---- import sk.ainet.apps.bert.java.KBertJava; import sk.ainet.apps.bert.java.KBertSession; import java.nio.file.Path; @@ -180,18 +189,20 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ float[] embedding = bert.encode("SKaiNET is a tensor framework"); System.out.println("Embedding dimension: " + embedding.length); } -``` +---- The directory must contain: -- `model.safetensors` -- BERT model weights -- `vocab.txt` -- WordPiece vocabulary -- `config.json` (optional) -- model config; defaults are used if absent -### Similarity Scoring +* `model.safetensors` -- BERT model weights +* `vocab.txt` -- WordPiece vocabulary +* `config.json` (optional) -- model config; defaults are used if absent + +==== Similarity Scoring Compute cosine similarity between two texts directly: -```java +[source,java] +---- try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/"))) { float score = bert.similarity( "The cat sat on the mat", @@ -206,21 +217,22 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ ); System.out.printf("Unrelated: %.4f%n", low); // e.g. 0.1247 } -``` +---- -The returned value is cosine similarity in the range [-1, 1]. +The returned value is cosine similarity in the range ++[++-1, 1++]++. ---- +''''' -## Agent Loop and Tool Calling +=== Agent Loop and Tool Calling All agent/tool classes live in `sk.ainet.apps.kllama.chat.java`. The `JavaAgentLoop` lets the LLM call tools in a loop until it produces a final answer. You define tools by implementing the `JavaTool` interface. 
-### Defining a Tool +==== Defining a Tool -```java +[source,java] +---- import sk.ainet.apps.kllama.chat.java.JavaTool; import sk.ainet.apps.kllama.chat.ToolDefinition; import java.util.Map; @@ -255,11 +267,12 @@ public class CalculatorTool implements JavaTool { return 0.0; } } -``` +---- -### Building and Using the Agent +==== Building and Using the Agent -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.chat.java.JavaAgentLoop; @@ -285,24 +298,26 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(Path.of("model.gguf"))) { // Reset conversation history (keeps system prompt) agent.reset(); } -``` +---- -### Streaming Agent Responses +==== Streaming Agent Responses -```java +[source,java] +---- String answer = agent.chat( "What is the square root of 144?", token -> System.out.print(token) ); -``` +---- ---- +''''' -## Resource Management +=== Resource Management Both `KLlamaSession` and `KBertSession` implement `AutoCloseable`. Always use `try-with-resources` to ensure off-heap memory arenas and other native resources are released promptly: -```java +[source,java] +---- // Single session try (KLlamaSession session = KLlamaJava.loadGGUF(path)) { session.generate("Hello"); @@ -315,23 +330,25 @@ try (KLlamaSession llama = KLlamaJava.loadGGUF(llamaPath); String text = llama.generate("Write a summary of quantum mechanics"); float[] embedding = bert.encode(text); } -``` +---- Failing to close sessions will leak off-heap memory allocated via `java.lang.foreign.Arena`. 
---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|----------------------------------------|---------------------------------------------| -| `sk.ainet.apps.kllama.java` | `KLlamaJava`, `KLlamaSession`, `GenerationConfig` | -| `sk.ainet.apps.bert.java` | `KBertJava`, `KBertSession` | -| `sk.ainet.apps.kllama.chat.java` | `JavaAgentLoop`, `JavaTool` | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.apps.kllama.java` |`KLlamaJava`, `KLlamaSession`, `GenerationConfig` +|`sk.ainet.apps.bert.java` |`KBertJava`, `KBertSession` +|`sk.ainet.apps.kllama.chat.java` |`JavaAgentLoop`, `JavaTool` +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [Model Training Guide](java-model-training.md) -- build and train neural networks from Java. +* xref:how-to/java-getting-started.adoc[Java Getting Started] -- tensor operations, project setup, and dependency management. +* xref:how-to/java-model-training.adoc[Model Training Guide] -- build and train neural networks from Java. diff --git a/docs/java-model-training.md b/docs/modules/ROOT/pages/how-to/java-model-training.adoc similarity index 80% rename from docs/java-model-training.md rename to docs/modules/ROOT/pages/how-to/java-model-training.adoc index 92e3e9cc..2abf7d17 100644 --- a/docs/java-model-training.md +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -1,15 +1,16 @@ -# Java Model Training Guide +== Java Model Training Guide This guide covers building neural networks, defining loss functions and optimizers, loading datasets, and running training loops -- all from plain Java.
-## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -41,15 +42,16 @@ This guide covers building neural networks, defining loss functions and optimize skainet-data-simple-jvm -``` +---- ---- +''''' -## Building a Model with SequentialModelBuilder +=== Building a Model with SequentialModelBuilder `SequentialModelBuilder` provides a fluent API for stacking dense layers and activations. It lives in `sk.ainet.java`. -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.java.SequentialModelBuilder; import sk.ainet.lang.nn.Module; @@ -63,39 +65,43 @@ Module model = new SequentialModelBuilder(ctx) .relu() // ReLU activation .dense(10) // fully connected: 128 -> 10 (digit classes) .build(); -``` - -### Available Layers and Activations - -| Method | Description | -|-------------------------|------------------------------------------| -| `.input(size)` | Set the input dimension (must be first) | -| `.dense(outputSize)` | Fully connected (linear) layer | -| `.relu()` | ReLU activation: max(0, x) | -| `.sigmoid()` | Sigmoid activation | -| `.silu()` | SiLU / Swish activation: x * sigmoid(x) | -| `.gelu()` | GELU activation | -| `.softmax(dim)` | Softmax along a dimension (default: -1) | -| `.flatten(start, end)` | Flatten dimensions | +---- + +==== Available Layers and Activations + +[cols=",",options="header",] +|=== +|Method |Description +|`.input(size)` |Set the input dimension (must be first) +|`.dense(outputSize)` |Fully connected (linear) layer +|`.relu()` |ReLU activation: max(0, x) +|`.sigmoid()` |Sigmoid activation +|`.silu()` |SiLU / Swish activation: x ++*++ sigmoid(x) +|`.gelu()` |GELU 
activation +|`.softmax(dim)` |Softmax along a dimension (default: -1) +|`.flatten(start, end)` |Flatten dimensions +|=== Weights are initialized using Xavier initialization. The data type defaults to FP32; pass a `DType` to the constructor to change it: -```java +[source,java] +---- Module model = new SequentialModelBuilder(ctx, DType.fp16()) .input(784) .dense(256) .gelu() .dense(10) .build(); -``` +---- ---- +''''' -## Losses +=== Losses The `Losses` factory (in `sk.ainet.java`) creates loss function instances: -```java +[source,java] +---- import sk.ainet.java.Losses; import sk.ainet.lang.nn.loss.Loss; @@ -110,15 +116,16 @@ Loss hub = Losses.huber(1.0f); // Huber / Smooth L1 Loss hin = Losses.hinge(1.0f); // hinge loss Loss shin = Losses.squaredHinge(1.0f); // squared hinge Loss poi = Losses.poisson(); // Poisson NLL -``` +---- ---- +''''' -## Optimizers +=== Optimizers The `Optimizers` factory (in `sk.ainet.java`) creates optimizer instances: -```java +[source,java] +---- import sk.ainet.java.Optimizers; import sk.ainet.lang.nn.optim.Optimizer; @@ -136,15 +143,16 @@ Optimizer sgd = Optimizers.sgd(0.01, 0.9); // SGD with momentum and weight decay Optimizer sgdWd = Optimizers.sgd(0.01, 0.9, 0.0001); -``` +---- ---- +''''' -## TrainingLoop +=== TrainingLoop `TrainingLoop` ties together a model, loss function, optimizer, and execution context. Build it with the static builder: -```java +[source,java] +---- import sk.ainet.java.TrainingLoop; TrainingLoop loop = TrainingLoop.builder() @@ -153,22 +161,25 @@ TrainingLoop loop = TrainingLoop.builder() .optimizer(Optimizers.adam(0.001)) .context(ctx) .build(); -``` +---- -### Single Training Step +==== Single Training Step `step(x, y)` performs one forward pass, computes the loss, backpropagates, and updates weights. 
It returns the loss as a `float`: -```java +[source,java] +---- float loss = loop.step(inputBatch, targetBatch); System.out.printf("Step loss: %.4f%n", loss); -``` +---- -### Full Training with `.train()` +[[full-training-with-train]] +==== Full Training with `.train()` `train()` accepts a `Supplier` that produces an `Iterator` of `(input, target)` pairs for each epoch: -```java +[source,java] +---- import sk.ainet.java.TrainingResult; import kotlin.Pair; @@ -179,15 +190,17 @@ TrainingResult result = loop.train( System.out.printf("Trained %d epochs, final loss: %.4f%n", result.getEpochs(), result.getFinalLoss()); -``` +---- Each call to the supplier should return a fresh iterator over the training batches for that epoch. This allows reshuffling between epochs. -### Async Training with `.trainAsync()` +[[async-training-with-trainasync]] +==== Async Training with `.trainAsync()` -`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture`: +`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture++<++TrainingResult++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = loop.trainAsync( @@ -199,23 +212,25 @@ CompletableFuture future = loop.trainAsync( TrainingResult result = future.join(); System.out.printf("Final loss: %.4f%n", result.getFinalLoss()); -``` +---- You can also compose the future: -```java +[source,java] +---- loop.trainAsync(() -> batches.iterator(), 10) .thenAccept(r -> System.out.println("Done! Loss: " + r.getFinalLoss())) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Loading MNIST Data +=== Loading MNIST Data The MNIST dataset loader lives in `sk.ainet.data.mnist`. 
The `MNISTBlocking` class provides blocking (non-suspend) methods for Java: -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTBlocking; import sk.ainet.data.mnist.MNISTDataset; @@ -225,34 +240,37 @@ MNISTDataset test = MNISTBlocking.loadTest(); System.out.println("Training samples: " + train.getImages().size()); // 60000 System.out.println("Test samples: " + test.getImages().size()); // 10000 -``` +---- The first call downloads the dataset from the internet and caches it. Subsequent calls load from disk. -### Custom Cache Directory +==== Custom Cache Directory -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTLoaderConfig; MNISTLoaderConfig config = new MNISTLoaderConfig("/tmp/my-mnist-cache", true); MNISTDataset train = MNISTBlocking.loadTrain(config); -``` +---- -### Working with MNIST Data +==== Working with MNIST Data -Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte[]` of 784 pixels (28x28) and a `byte` label (0-9): +Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte++[]++` of 784 pixels (28x28) and a `byte` label (0-9): -```java +[source,java] +---- var firstImage = train.getImages().get(0); byte label = firstImage.getLabel(); // e.g. 
5 byte[] pixels = firstImage.getImage(); // 784 bytes, 0-255 -``` +---- -### Creating Tensor Batches +==== Creating Tensor Batches To feed MNIST data into the training loop, convert images to tensors: -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.lang.types.DType; import kotlin.Pair; @@ -287,15 +305,16 @@ for (int i = 0; i < images.size(); i += batchSize) { var y = SKaiNET.tensor(ctx, new int[]{actual}, DType.fp32(), yData); batches.add(new Pair<>(x, y)); } -``` +---- ---- +''''' -## Complete MNIST Training Example +=== Complete MNIST Training Example Putting it all together: -```java +[source,java] +---- package com.example; import sk.ainet.java.*; @@ -382,23 +401,25 @@ public class MnistTraining { return batches; } } -``` +---- Run with: -```bash +[source,bash] +---- java --enable-preview --add-modules jdk.incubator.vector \ -cp target/classes:target/dependency/* \ com.example.MnistTraining -``` +---- ---- +''''' -## Async Training Example +=== Async Training Example For non-blocking training, use `trainAsync()` and handle the result with `CompletableFuture`: -```java +[source,java] +---- var future = loop.trainAsync(() -> (Iterator) batches.iterator(), 10); // Monitor progress or do other work @@ -408,23 +429,25 @@ future.thenAccept(result -> { System.out.printf("Finished: %d epochs, loss %.4f%n", result.getEpochs(), result.getFinalLoss()); }).join(); -``` +---- ---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|-----------------------|------------------------------------------------------| -| `sk.ainet.java` | `SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` | -| `sk.ainet.data.mnist` | `MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` | -| `sk.ainet.lang.types` | `DType` | -| `sk.ainet.lang.nn.loss` | `Loss` (interface returned by `Losses` factory) | -| `sk.ainet.lang.nn.optim` | `Optimizer` (interface returned by 
`Optimizers` factory) | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.java` |`SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` +|`sk.ainet.data.mnist` |`MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` +|`sk.ainet.lang.types` |`DType` +|`sk.ainet.lang.nn.loss` |`Loss` (interface returned by `Losses` factory) +|`sk.ainet.lang.nn.optim` |`Optimizer` (interface returned by `Optimizers` factory) +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, and build agents. +* xref:how-to/java-getting-started.adoc[Java Getting Started] -- tensor operations, project setup, and dependency management. +* xref:how-to/java-llm-inference.adoc[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, and build agents. diff --git a/docs/modules/ROOT/pages/reference/api.adoc b/docs/modules/ROOT/pages/reference/api.adoc new file mode 100644 index 00000000..ba400d1f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/api.adoc @@ -0,0 +1,19 @@ += API Reference +:description: Kotlin API reference generated by Dokka. + +The full Kotlin API reference for every SKaiNET module is +generated by https://kotlinlang.org/docs/dokka-introduction.html[Dokka] +and published as a sibling path of this documentation site. + +link:../api/index.html[Open the Dokka API reference, window=_blank] + +[NOTE] +==== +The Dokka output is bundled into the published site by a +`bundleDokkaIntoSite` Gradle task that runs **after** Antora +writes the site. When you preview the site locally via +`docker run ... antora ... docs/antora-playbook.yml`, the +`/api/` path does not yet exist — run +`./gradlew bundleDokkaIntoSite` to populate it before clicking +through.
+==== diff --git a/docs/modules/ROOT/pages/reference/architecture.adoc b/docs/modules/ROOT/pages/reference/architecture.adoc new file mode 100644 index 00000000..d350b26f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/architecture.adoc @@ -0,0 +1,11 @@ += Architecture +:description: How SKaiNET's compile and execution layers are organized. + +SKaiNET uses a hybrid backend strategy that separates development +iteration from production deployment. + +image::SKaiNET-compiler.svg[Architecture diagram of the SKaiNET compiler pipeline] + +// The original ARCHITECTURE.md at the repo root was a 4-line stub +// pointing at the compiler diagram. If you are looking for a +// deeper architecture write-up, contribute it as a PR to this page. diff --git a/docs/graph-dsl.md b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc similarity index 77% rename from docs/graph-dsl.md rename to docs/modules/ROOT/pages/tutorials/graph-dsl.adoc index 8c65ac17..3112dfc5 100644 --- a/docs/graph-dsl.md +++ b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc @@ -1,12 +1,13 @@ -# SKaiNET Graph DSL +== SKaiNET Graph DSL The SKaiNET Graph DSL provides a way to define complex directed acyclic graphs (DAGs) for machine learning models. Unlike the sequential `nn` DSL, the `dag` DSL allows for arbitrary wiring of nodes, multi-output graphs, and reusable modules. -## Basic Usage +=== Basic Usage To define a graph, use the `dag` block: -```kotlin +[source,kotlin] +---- val program = dag { val x = input("input", TensorSpec("input", listOf(1, 3, 224, 224), "FP32")) @@ -18,33 +19,35 @@ val program = dag { output(activated) } -``` +---- -## Key Concepts +=== Key Concepts -### Inputs, Parameters, and Constants +==== Inputs, Parameters, and Constants -- `input(name, spec)`: Defines an input node for the graph. -- `parameter(name) { ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. -- `constant(name) { ... 
}`: Defines a constant node (e.g., fixed biases or weights). +* `input++<++T++>++(name, spec)`: Defines an input node for the graph. +* `parameter++<++T, V++>++(name) ++{++ ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. +* `constant++<++T, V++>++(name) ++{++ ... }`: Defines a constant node (e.g., fixed biases or weights). -### Operations +==== Operations Standard operations like `conv2d`, `relu`, `matmul`, `add`, etc., are available as extension functions within the `DagBuilder` (operations are in sync with TensorOps and implemented extention method via KSP). -### Outputs +==== Outputs A graph can have one or more outputs, defined using the `output()` function. -```kotlin +[source,kotlin] +---- output(branch1, branch2) -``` +---- -## Reusable Modules +=== Reusable Modules You can define reusable graph components using `dagModule`: -```kotlin +[source,kotlin] +---- val residualBlock = dagModule { inputs -> val x = inputs[0] val conv1 = conv2d(x, w1, b1, padding = 1 to 1) @@ -59,25 +62,27 @@ val program = dag { val out = module(residualBlock, listOf(x)) output(out[0]) } -``` +---- -## Compiling and Validating +=== Compiling and Validating Once a `GraphProgram` is built, it can be converted to a `ComputeGraph` for execution or compilation: -```kotlin +[source,kotlin] +---- val graph = program.toComputeGraph() val validation = graph.validate() if (validation is ValidationResult.Valid) { // proceed to execution or compilation } -``` +---- -## YOLO-style Example +=== YOLO-style Example The Graph DSL is particularly useful for complex architectures like YOLO heads: -```kotlin +[source,kotlin] +---- val program = dag { val input = input("input", TensorSpec("input", listOf(1, 3, 640, 640), "FP32")) @@ -90,4 +95,4 @@ val program = dag { output(c2, head) // Multi-scale outputs } -``` +---- diff --git a/docs/hlo-getting-started.md b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc similarity index 59% rename 
from docs/hlo-getting-started.md rename to docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc index c0a116a4..d7d47a92 100644 --- a/docs/hlo-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc @@ -1,34 +1,35 @@ -# Getting Started with HLO in SKaiNET +== Getting Started with HLO in SKaiNET -## What is HLO? +=== What is HLO? -HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on [StableHLO](https://github.com/openxla/stablehlo) - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. +HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on https://github.com/openxla/stablehlo[StableHLO] - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. -## Why MLIR/XLA Instead of Direct Backends? +=== Why MLIR/XLA Instead of Direct Backends? SKaiNET uses the MLIR/XLA compilation approach rather than implementing separate backends for each hardware target. This design choice provides several key advantages: -**Single Implementation Path**: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. +*Single Implementation Path*: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. -**Automatic Optimization**: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. 
+*Automatic Optimization*: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. -**Future-Proof**: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. +*Future-Proof*: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. -**Ecosystem Integration**: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. +*Ecosystem Integration*: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. -### Key Benefits +==== Key Benefits -- **Portability**: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) -- **Optimization**: Leverage XLA's advanced compiler optimizations and operator fusion -- **Interoperability**: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems -- **Performance**: Hardware-specific optimizations without manual kernel development -- **No Backend Lock-in**: Single compilation target supports all hardware through XLA +* *Portability*: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) +* *Optimization*: Leverage XLA's advanced compiler optimizations and operator fusion +* *Interoperability*: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems +* *Performance*: Hardware-specific optimizations without manual kernel development +* *No Backend Lock-in*: Single compilation target supports all hardware through XLA -## Architecture Overview +=== Architecture Overview SKaiNET's HLO compilation pipeline transforms high-level Kotlin DSL operations into hardware-optimized executable code through the MLIR/XLA ecosystem: -```mermaid +[mermaid] +---- graph TD A[Kotlin DSL] --> B[Compute Graph] B --> C[HLO Converter] @@ -58,11 +59,12 
@@ graph TD style A fill:#e1f5fe style D fill:#f3e5f5 style F fill:#e8f5e8 -``` +---- -### Data Flow Architecture +==== Data Flow Architecture -```mermaid +[mermaid] +---- flowchart LR subgraph "Input Layer" DSL[Kotlin DSL Code] @@ -92,48 +94,53 @@ flowchart LR style DSL fill:#bbdefb style Conv fill:#c8e6c9 style MLIR fill:#ffcdd2 -``` +---- -## Building Blocks +=== Building Blocks -### 1. HLO Converters +[[1-hlo-converters]] +==== 1. HLO Converters Converters transform SKaiNET operations into StableHLO operations: -- **MathOperationsConverter**: Basic arithmetic operations -- **LinalgOperationsConverter**: Linear algebra operations -- **ActivationOperationsConverter**: Neural network activations -- **NeuralNetOperationsConverter**: High-level NN operations -- **ConstantOperationsConverter**: Constant value operations +* *MathOperationsConverter*: Basic arithmetic operations +* *LinalgOperationsConverter*: Linear algebra operations +* *ActivationOperationsConverter*: Neural network activations +* *NeuralNetOperationsConverter*: High-level NN operations +* *ConstantOperationsConverter*: Constant value operations -### 2. Type System +[[2-type-system]] +==== 2. Type System HLO uses a strict type system for tensors: -```kotlin +[source,kotlin] +---- // SKaiNET tensor type Tensor // Batch, Channel, Height, Width // Converts to HLO type tensor<1x3x224x224xf32> // StableHLO representation -``` +---- -### 3. Optimization Framework +[[3-optimization-framework]] +==== 3. 
Optimization Framework The optimization pipeline includes: -- **Shape inference and propagation** -- **Constant folding and dead code elimination** -- **Operation fusion for performance** -- **Memory layout optimization** +* *Shape inference and propagation* +* *Constant folding and dead code elimination* +* *Operation fusion for performance* +* *Memory layout optimization* -## Practical Example: RGB to Grayscale Conversion +=== Practical Example: RGB to Grayscale Conversion -Let's walk through converting a color image tensor `Tensor` to grayscale using matrix multiplication. +Let's walk through converting a color image tensor `Tensor++<++B,C,H,W++>++` to grayscale using matrix multiplication. -### Step 1: Define the Operation in Kotlin DSL +==== Step 1: Define the Operation in Kotlin DSL -```kotlin +[source,kotlin] +---- // From: skainet-lang/skainet-lang-models/src/commonMain/kotlin/sk/ainet/lang/model/compute/Rgb2GrayScaleMultiply.kt fun Tensor.rgb2GrayScaleMatMul(): Tensor { // RGB to grayscale weights: [0.299, 0.587, 0.114] @@ -151,13 +158,14 @@ fun Tensor.rgb2GrayScaleMatMul(): Tensor { // Reshape back to [B,1,H,W] return gray.transpose(intArrayOf(0, 3, 1, 2)) } -``` +---- -### Step 2: HLO Conversion Process +==== Step 2: HLO Conversion Process The conversion pipeline transforms this operation: -```mermaid +[mermaid] +---- sequenceDiagram participant DSL as Kotlin DSL participant DAG as Compute Graph @@ -173,13 +181,14 @@ sequenceDiagram Opt->>HLO: Optimized IR Note over Conv,HLO: Type inference:
tensor → tensor -``` +---- -### Step 3: Generated StableHLO IR +==== Step 3: Generated StableHLO IR The converter produces MLIR code like this: -```mlir +[source,mlir] +---- func.func @rgb2grayscale(%input: tensor) -> tensor { // Define grayscale conversion weights %weights = stablehlo.constant dense<[[0.299], [0.587], [0.114]]> : tensor<3x1xf32> @@ -199,28 +208,30 @@ func.func @rgb2grayscale(%input: tensor) -> tensor { return %result : tensor } -``` +---- -## Hardware Target Compilation via XLA +=== Hardware Target Compilation via XLA SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware platforms without requiring separate backend implementations. The StableHLO IR serves as a portable intermediate representation that XLA can compile to optimized code for various targets. -### Supported Hardware Targets +==== Supported Hardware Targets -- **CPU**: x86_64, ARM64 (via XLA CPU backend) -- **GPU**: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) -- **TPU**: Google TPUs (via XLA TPU backend) -- **Mobile**: iOS Metal, Android GPU (via XLA mobile backends) +* *CPU*: x86++_++64, ARM64 (via XLA CPU backend) +* *GPU*: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) +* *TPU*: Google TPUs (via XLA TPU backend) +* *Mobile*: iOS Metal, Android GPU (via XLA mobile backends) -### Prerequisites for GPU Compilation +==== Prerequisites for GPU Compilation -1. **XLA with GPU support**: [Installation guide](https://www.tensorflow.org/xla/tutorials/compile) -2. **NVIDIA CUDA Toolkit** (for NVIDIA GPUs): [Download here](https://developer.nvidia.com/cuda-downloads) -3. **ROCm** (for AMD GPUs): [Installation guide](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) +[arabic] +. *XLA with GPU support*: https://www.tensorflow.org/xla/tutorials/compile[Installation guide] +. *NVIDIA CUDA Toolkit* (for NVIDIA GPUs): https://developer.nvidia.com/cuda-downloads[Download here] +. 
*ROCm* (for AMD GPUs): https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html[Installation guide] -### Step 1: Generate StableHLO IR +==== Step 1: Generate StableHLO IR -```bash +[source,bash] +---- # Build SKaiNET HLO compiler ./gradlew :skainet-compile:skainet-compile-hlo:build @@ -228,11 +239,12 @@ SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware plat ./gradlew :skainet-compile:skainet-compile-hlo:generateHlo \ -Pmodel=rgb2grayscale \ -Poutput=rgb2grayscale.mlir -``` +---- -### Step 2: Compile with XLA for Target Hardware +==== Step 2: Compile with XLA for Target Hardware -```bash +[source,bash] +---- # Compile to GPU executable (NVIDIA CUDA) xla_compile \ --input_format=mlir \ @@ -257,24 +269,26 @@ xla_compile \ --platform=tpu \ --input_file=rgb2grayscale.mlir \ --output_file=rgb2grayscale_tpu.so -``` +---- -### Step 3: Runtime Execution +==== Step 3: Runtime Execution -```bash +[source,bash] +---- # Execute on target hardware using XLA runtime xla_run \ --executable=rgb2grayscale_cuda.so \ --input=image.jpg \ --output=gray.jpg \ --device=gpu:0 -``` +---- -### Jetson and Edge Device Deployment +==== Jetson and Edge Device Deployment For NVIDIA Jetson and other edge devices, the same MLIR → XLA compilation approach applies: -```bash +[source,bash] +---- # Cross-compile for ARM64 with CUDA support xla_compile \ --input_format=mlir \ @@ -292,15 +306,16 @@ scp rgb2grayscale_jetson.so jetson@192.168.1.100:~/models/ ssh jetson@192.168.1.100 cd ~/models xla_run --executable=rgb2grayscale_jetson.so --device=gpu:0 -``` +---- -## Advanced Topics +=== Advanced Topics -### Custom HLO Operations +==== Custom HLO Operations Extend SKaiNET with custom operations: -```kotlin +[source,kotlin] +---- // Define custom operation @HloOperation("custom.rgb_enhance") class RgbEnhanceOp : HloConverter { @@ -311,34 +326,36 @@ class RgbEnhanceOp : HloConverter { """ } } -``` +---- -### Debugging HLO +==== Debugging HLO Use SKaiNET's 
built-in debugging tools: -```kotlin +[source,kotlin] +---- // Enable HLO debugging val optimizer = StableHloOptimizer(debugMode = true) val optimizedHlo = optimizer.optimize(hloModule) // Visualize computation graph optimizer.dumpGraphviz("rgb2gray.dot") -``` +---- -## Resources and References +=== Resources and References -- [StableHLO Specification](https://github.com/openxla/stablehlo/blob/main/docs/spec.md) -- [MLIR Documentation](https://mlir.llvm.org/docs/) -- [XLA Compilation Guide](https://www.tensorflow.org/xla) -- [NVIDIA Jetson Documentation](https://docs.nvidia.com/jetson/) -- [SKaiNET HLO Examples](./examples/hlo/) +* https://github.com/openxla/stablehlo/blob/main/docs/spec.md[StableHLO Specification] +* https://mlir.llvm.org/docs/[MLIR Documentation] +* https://www.tensorflow.org/xla[XLA Compilation Guide] +* https://docs.nvidia.com/jetson/[NVIDIA Jetson Documentation] +* link:./examples/hlo/[SKaiNET HLO Examples] -## Next Steps +=== Next Steps -1. **Explore Examples**: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` -2. **Run Tests**: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` -3. **Contribute**: Add new HLO converters for missing operations -4. **Optimize**: Profile and optimize your models using HLO tools +[arabic] +. *Explore Examples*: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` +. *Run Tests*: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` +. *Contribute*: Add new HLO converters for missing operations +. *Optimize*: Profile and optimize your models using HLO tools -For more detailed information, see the [HLO Optimization Guide](./OPTIMIZATION.md) and [API Documentation](https://docs.skainet.sk/hlo/). \ No newline at end of file +For more detailed information, see the link:./OPTIMIZATION.md[HLO Optimization Guide] and https://docs.skainet.sk/hlo/[API Documentation]. 
diff --git a/docs/java-getting-started.md b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc similarity index 78% rename from docs/java-getting-started.md rename to docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index e64be280..003a6d46 100644 --- a/docs/java-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -1,31 +1,33 @@ -# Java Getting Started Guide +== Java Getting Started Guide This guide gets you from zero to running tensor operations with SKaiNET in under 5 minutes. SKaiNET is a Kotlin Multiplatform AI framework, but every JVM-facing API is designed for idiomatic Java usage -- no Kotlin knowledge required. -## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* -## JVM Flags +=== JVM Flags SKaiNET uses the Java Vector API for SIMD-accelerated tensor operations. You must pass two flags every time you run your application: -``` +.... --enable-preview --add-modules jdk.incubator.vector -``` +.... -For Maven Surefire / exec-maven-plugin, add them to ``. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. +For Maven Surefire / exec-maven-plugin, add them to `++<++jvmArgs++>++`. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. ---- +''''' -## Maven Setup +=== Maven Setup -### 1. Import the BOM +[[1-import-the-bom]] +==== 1. Import the BOM -The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. Add it to your `` section: +The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. 
Add it to your `++<++dependencyManagement++>++` section: -```xml +[source,xml] +---- 0.13.0 @@ -76,13 +78,15 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep -``` +---- -### 2. Add More Modules as Needed +[[2-add-more-modules-as-needed]] +==== 2. Add More Modules as Needed Because the BOM is imported, you can add any module without repeating the version: -```xml +[source,xml] +---- sk.ainet @@ -106,13 +110,14 @@ Because the BOM is imported, you can add any module without repeating the versio sk.ainet skainet-kllama-agent-jvm -``` +---- ---- +''''' -## Gradle Kotlin DSL Setup +=== Gradle Kotlin DSL Setup -```kotlin +[source,kotlin] +---- plugins { java application @@ -144,15 +149,16 @@ application { tasks.withType { options.compilerArgs.addAll(listOf("--enable-preview")) } -``` +---- ---- +''''' -## Hello Tensor +=== Hello Tensor Create `src/main/java/com/example/HelloTensor.java`: -```java +[source,java] +---- package com.example; import sk.ainet.java.SKaiNET; @@ -186,39 +192,43 @@ public class HelloTensor { System.out.println("after relu: " + d); } } -``` +---- Run it: -```bash +[source,bash] +---- # Maven mvn compile exec:java # Gradle ./gradlew run -``` +---- ---- +''''' -## Key Entry Points +=== Key Entry Points All Java-facing classes live in the `sk.ainet.java` package: -| Class | Purpose | -|-------------------|--------------------------------------------------------| -| `SKaiNET` | Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` | -| `TensorJavaOps` | Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... | -| `Losses` | Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... | -| `Optimizers` | Optimizer factory -- `adam()`, `adamw()`, `sgd()` | -| `DType` | Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... 
| +[cols=",",options="header",] +|=== +|Class |Purpose +|`SKaiNET` |Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` +|`TensorJavaOps` |Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... +|`Losses` |Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... +|`Optimizers` |Optimizer factory -- `adam()`, `adamw()`, `sgd()` +|`DType` |Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... +|=== ---- +''''' -## Data Types +=== Data Types Access data types through static methods on `DType` (from `sk.ainet.lang.types`): -```java +[source,java] +---- import sk.ainet.lang.types.DType; DType f32 = DType.fp32(); // 32-bit float (default) @@ -229,15 +239,16 @@ DType i8 = DType.int8(); // 8-bit integer DType i32 = DType.int32(); // 32-bit integer DType i64 = DType.int64(); // 64-bit integer DType u8 = DType.uint8(); // unsigned 8-bit -``` +---- -You can also use the constant fields if you prefer: `DType.FP32_TYPE`, `DType.INT32_TYPE`, etc. +You can also use the constant fields if you prefer: `DType.FP32++_++TYPE`, `DType.INT32++_++TYPE`, etc. ---- +''''' -## Common Tensor Operations +=== Common Tensor Operations -```java +[source,java] +---- var ctx = SKaiNET.context(); // Creation @@ -277,11 +288,11 @@ var flat = TensorJavaOps.flatten(a); var resh = TensorJavaOps.reshape(a, new int[]{1, -1}); var sq = TensorJavaOps.squeeze(a, 0); var usq = TensorJavaOps.unsqueeze(a, 0); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. -- [Model Training Guide](java-model-training.md) -- build sequential models, train on MNIST, and run async training loops. +* link:java-llm-inference.md[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. 
+* link:java-model-training.md[Model Training Guide] -- build sequential models, train on MNIST, and run async training loops. diff --git a/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc new file mode 100644 index 00000000..153d32ef --- /dev/null +++ b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc @@ -0,0 +1,26 @@ +== KLlama Getting Started + +KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. + +____ +*Early Stage Development*: The project is in active development. We appreciate your feedback and bug reports! +____ + +=== Choose Your Path + +[cols=",",options="header",] +|=== +|Goal |Guide +|*Run models from the command line* |link:../skainet-apps/skainet-kllama-cli/README.md[KLlama CLI] +|*Embed in a Kotlin application* |link:../skainet-apps/skainet-kllama/README.md[KLlama Library] +|*Embed in a Java application* |link:java-llm-inference.md[Java LLM Inference Guide] +|*Build a standalone Java CLI app* |link:java-cli-app.md[Java CLI App Guide] +|*Java project setup (Maven / Gradle)* |link:java-getting-started.md[Java Getting Started] +|=== + +=== Quick Links + +* link:++../skainet-apps/skainet-kllama/README.md#supported-formats--quantization++[Supported formats & quantization] +* link:../skainet-apps/skainet-kllama/README.md#custom-backend-integration[Custom backend integration] +* link:java-llm-inference.md#agent-loop-and-tool-calling[Agent & tool calling] +* link:java-llm-inference.md#bert-encoding-and-similarity[BERT embeddings & similarity] diff --git a/docs/nav.adoc b/docs/nav.adoc deleted file mode 100644 index f23df7ae..00000000 --- a/docs/nav.adoc +++ /dev/null @@ -1,50 +0,0 @@ -= SKaiNET Documentation Navigation - -[#main-nav] -== Main Navigation - -* xref:theory/index.adoc[Mathematical Theory] -** 
xref:theory/matmul.adoc[Matrix Multiplication] -* xref:examples/index.adoc[Usage Examples] -** xref:examples/matmul-examples.adoc[Matrix Multiplication Examples] -* xref:modules/operators/_generated_/index.adoc[Generated API Reference] - -[#quick-reference] -== Quick Reference - -=== Core Operations -* xref:theory/matmul.adoc#matmul-definition[Matrix Multiplication Theory] -* xref:examples/matmul-examples.adoc#basic-usage[Basic Matrix Multiplication] -* xref:examples/matmul-examples.adoc#neural-network[Neural Network Applications] - -=== Documentation Structure -* `docs/theory/` - Mathematical definitions and theoretical foundations -* `docs/examples/` - Practical usage examples and code samples -* `docs/modules/operators/_generated_/` - Auto-generated API reference - -[#toc-template] -== Table of Contents Template - -The following template can be used for generating table of contents in documentation pages: - ----- -[discrete] -== Table of Contents - -* <> -** <> -* <> ----- - -[#cross-reference-patterns] -== Cross-Reference Patterns - -=== Internal Links -* Theory to Examples: `xref:../examples/matmul-examples.adoc#basic-usage[Matrix Multiplication Examples]` -* Examples to Theory: `xref:../theory/matmul.adoc#matmul-definition[Mathematical Definition]` -* Generated to Human: `xref:../../theory/index.adoc[Theory Reference]` - -=== Anchor Naming Conventions -* Theory anchors: `#operation-definition`, `#operation-properties`, `#operation-complexity` -* Example anchors: `#basic-usage`, `#advanced-usage`, `#performance-tips` -* Generated anchors: `#operator-{name}`, `#function-{operator}-{function}` \ No newline at end of file diff --git a/docs/perf/java-25-cpu-backend.md b/docs/perf/java-25-cpu-backend.md deleted file mode 100644 index e66f588d..00000000 --- a/docs/perf/java-25-cpu-backend.md +++ /dev/null @@ -1,99 +0,0 @@ -### Java 25 Advantages for the JVM CPU Backend - -Java 25 (GA September 2025) delivers significant free performance improvements to the -SKaiNET 
JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup -features — all without requiring code changes. - -#### Compatibility - -The same code, same flags, and same runtime detection work across JDK 21–25: - -- Vector API remains incubator on JDK 25 (JEP 508) — identical `jdk.incubator.vector` package. -- Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22+. -- Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. -- Build config (`jvmTarget = JVM_21`, `options.release.set(21)`) produces compatible bytecode. - -**No special treatment is needed for JDK >= 21 but < 25.** - -Required flags remain: -``` ---enable-preview --add-modules jdk.incubator.vector -``` - -#### JIT / C2 improvements mapped to SKaiNET ops - -These are automatic — the JIT produces better native code for existing bytecode. - -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| VPointer refactoring for vector loads/stores | [JDK-8350748](https://bugs.openjdk.org/browse/JDK-8350748) | up to 14x | All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` | -| SuperWord SIMD enhancement | [JDK-8343685](https://bugs.openjdk.org/browse/JDK-8343685) | up to 33x | Same vectorized loops (elementwise, reductions, matmul inner loops) | -| `Math.max` / `Math.min` intrinsified for `long` | JDK-8350485 | 3–5x | Shape computation, tile clamping in blocked matmul | - -Source files: -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` - -#### Panama FFI improvements - -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| Faster `MemorySegment` allocation | [JDK-8345687](https://bugs.openjdk.org/browse/JDK-8345687) | ~2x | `MemorySegmentTensorData.kt` 
(`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` | -| `MemorySegment::fill` optimized on AArch64 | [JDK-8354674](https://bugs.openjdk.org/browse/JDK-8354674) | ~2.5x | Tensor zeroing, blocked matmul result initialization | - -Source files: -- `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` -- `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` - -#### Object layout and GC - -- **Compact Object Headers** (JEP 519) — reduces object header from 12 to 8 bytes. - Meaningful for tensor metadata arrays with millions of small objects. - Opt-in: `-XX:+UseCompactObjectHeaders` - -- **Generational Shenandoah** (JEP 521) — lower GC pause times for allocation-heavy - workloads (tensor creation, KV cache churn). - Opt-in: `-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational` - -#### Startup and warmup - -- **AOT profiling / caching** (JEP 515) — records JIT profile data from a training run - and replays it on subsequent launches. Reduces warmup by 15–25%. - Useful for CLI apps like kLLaMA where first-token latency matters. 
- -Usage: -``` -# Training run (records profile) -java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" - -# Production run (replays profile) -java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" -``` - -#### Recommended JVM flags for Java 25 - -Required (same as JDK 21–24): -``` ---enable-preview ---add-modules jdk.incubator.vector -``` - -Optional — enable for maximum benefit on JDK 25: -``` --XX:+UseCompactObjectHeaders --XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational --XX:AOTCache=app.aot # after training run -``` - -#### Summary - -| Feature | Benefit | Component | -|---|---|---| -| VPointer refactoring (C2) | Up to 14x faster vector loads/stores | `JvmVectorKernels`, `JvmQuantizedVectorKernels` | -| SuperWord SIMD (C2) | Up to 33x faster auto-vectorized loops | Same vector kernel files | -| `Math.max/min` intrinsic | 3–5x faster long comparisons | Shape computation, tile clamping | -| Faster segment allocation | ~2x allocation throughput | `MemorySegmentTensorDataFactory`, `PagedKvCache` | -| `MemorySegment::fill` (AArch64) | ~2.5x faster bulk zeroing | Tensor init, matmul result buffers | -| Compact Object Headers | ~30% smaller object headers | All tensor metadata | -| Generational Shenandoah | Lower GC pauses | Allocation-heavy inference | -| AOT profiling | 15–25% faster warmup | CLI apps (kLLaMA) | diff --git a/docs/perf/jvm-cpu.md b/docs/perf/jvm-cpu.md deleted file mode 100644 index fc981566..00000000 --- a/docs/perf/jvm-cpu.md +++ /dev/null @@ -1,94 +0,0 @@ -### JVM CPU Backend Performance Benchmarks (JMH) - -This page explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence for performance targets. 
- -#### What’s included -- Elementwise: FP32 `add` on 1,000,000 elements -- Reductions: FP32 `sum` and `mean` on 1,000,000 elements -- Matmul: FP32 square `matmul` with sizes 256, 512, and 1024 - -Benchmarks are implemented in module: -- `:skainet-backends:benchmarks:jvm-cpu-jmh` - -Source files: -- `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt` - -#### Prerequisites -- JDK 21+ (JDK 22 toolchain configured by Gradle) -- Gradle will pass required JVM flags: - - `--enable-preview` - - `--add-modules jdk.incubator.vector` - -For Java 25-specific performance advantages, see [Java 25 CPU Backend](java-25-cpu-backend.md). - -#### Feature flags -You can toggle acceleration paths at runtime using system properties or environment variables: -- Vector acceleration: - - `-Dskainet.cpu.vector.enabled=true|false` - - or `SKAINET_CPU_VECTOR_ENABLED=true|false` -- BLAS via Panama (matmul heuristic for larger sizes): - - `-Dskainet.cpu.blas.enabled=true|false` - - or `SKAINET_CPU_BLAS_ENABLED=true|false` - -Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args. - -#### How to run all benchmarks -From repository root: - -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh -``` - -This will build and execute all JMH benchmarks with the default parameters defined in sources. 
- -#### Run specific benchmarks -- Elementwise add (both vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=ElementwiseAdd1MBench -``` - -- Reductions (vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=Reductions1MBench -``` - -- Matmul, all sizes, with vector on and BLAS on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true -``` - -- Matmul at 512 only, comparing BLAS on/off with vector on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.size=512 \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true,false -``` - -Notes: -- You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). -- JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. - -#### Recording environment details -Include at minimum: -- CPU model, cores/threads, base/boost clock -- RAM size and speed -- OS version -- JDK version and vendor -- Gradle version -- JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) -- SKaiNET flags used (vector, BLAS) - -#### Performance targets (to be validated on your hardware) -- ≥ 4× speedup on FP32 `matmul` 512×512 vs baseline scalar -- ≥ 3× speedup on FP32 `add` with 1M elements vs baseline scalar - -Use the above commands to produce “vector=false/blas=false” baselines vs “vector=true[/blas=true]” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. 
diff --git a/docs/skainet-4-ai.md b/docs/skainet-4-ai.md deleted file mode 100644 index d9d2c6a6..00000000 --- a/docs/skainet-4-ai.md +++ /dev/null @@ -1,127 +0,0 @@ -# SKaiNET Core Technology: Tensor & Data Guide - -This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. - -## 1. Fundamental Architecture: Tensor Composition - -Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a **compositional architecture**. A `Tensor` is composed of two primary components: - -1. **`TensorData`**: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. -2. **`TensorOps`**: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). - -This separation allows for high flexibility, such as switching execution backends without changing the data representation. - -```kotlin -interface Tensor { - val data: TensorData - val ops: TensorOps - val dtype: KClass - val shape: Shape -} -``` - -## 2. Type-Safe Tensor Creation (DSL) - -SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). - -### Creation with `ExecutionContext` - -Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. 
- -```kotlin -// Basic creation -val zeros = ctx.zeros(Shape(2, 3), FP32::class) -val ones = ctx.ones(Shape(1, 10), Int32::class) -val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) -``` - -### Expressive Tensor DSL - -For more complex initializations, use the `tensor` DSL: - -```kotlin -val myTensor = tensor(ctx, FP32::class) { - shape(2, 2) { - from(1.0f, 2.0f, 3.0f, 4.0f) - } -} - -val randomTensor = tensor(ctx, FP32::class) { - shape(10, 10) { - randn(mean = 0f, std = 1f) - } -} - -val customInit = tensor(ctx, Int32::class) { - shape(5, 5) { - init { indices -> indices[0] + indices[1] } - } -} -``` - -## 3. Slicing DSL API - -SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. - -### `sliceView` vs `sliceCopy` - -- **`sliceView`**: Creates a `TensorView`, which is a window into the original data (no data copying). -- **`sliceCopy`**: Creates a new `Tensor` with a copy of the sliced data. - -### Slicing DSL Syntax - -The `SegmentBuilder` provides several ways to define slices for each dimension: - -- `range(start, end)`: A range of indices. -- `at(index)`: A single index (reduces rank). -- `all()`: All elements in that dimension (equivalent to `:` in NumPy). -- `step(start, end, step)`: Strided access. -- `+all()`: Short-hand for `all()`. - -```kotlin -val source = ctx.ones(Shape(10, 20, 30), FP32::class) - -// Slicing: [0:5, 10, :] -val view = source.sliceView { - segment { range(0, 5) } // Dim 0 - segment { at(10) } // Dim 1 - segment { all() } // Dim 2 -} -``` - -## 4. Core Operations (`TensorOps`) - -All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: - -- **Element-wise Ops**: `add`, `subtract`, `multiply`, `divide` (and scalar versions). -- **Linear Algebra**: `matmul`, `transpose`. -- **Neural Network Ops**: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. -- **Reductions**: `sum`, `mean`, `variance`. 
-- **Shape Ops**: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. - -### Operator Overloading - -When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: - -```kotlin -val c = a + b // Calls ops.add(a, b) -val d = a * 10 // Calls ops.mulScalar(a, 10) -``` - -## 5. Summary Table: SKaiNET vs NumPy - -| Feature | NumPy | SKaiNET | -| :--- | :--- | :--- | -| **Primary Type** | `ndarray` | `Tensor` | -| **Creation** | `np.array([1, 2, 3])` | `tensor(ctx, FP32::class) { shape(3) { from(1f, 2f, 3f) } }` | -| **Zeros** | `np.zeros((2, 2))` | `ctx.zeros(Shape(2, 2), FP32::class)` | -| **Slicing** | `a[0:5, :]` | `a.sliceView { segment { range(0, 5) }; segment { all() } }` | -| **Matmul** | `a @ b` or `np.matmul(a, b)` | `ctx.ops.matmul(a, b)` | -| **Reshape** | `a.reshape(new_shape)` | `ctx.ops.reshape(a, Shape(new_shape))` | - -## 6. Best Practices for AI Integration - -1. **Context Awareness**: Always pass the `ExecutionContext` to functions that create or manipulate tensors. -2. **Type Safety**: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. -3. **Views over Copies**: Use `sliceView` whenever possible to minimize memory overhead and improve performance. -4. **Backend Agnostic**: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend. From b98ad96276fa964b7bbd437012f24783011dd65c Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:38:04 +0200 Subject: [PATCH 3/6] Repoint ops-docs generator into Antora module (#494 step 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three coupled changes that move the custom `GenerateDocumentationTask` output from the pre-staged `docs/modules/operators/_generated_/` tree into the Antora `modules/ROOT/pages/reference/operators/generated/` tree so the generated operator fragments actually surface on the published site. 1. 
Root `build.gradle.kts` `documentation { }` block: outputDirectory now points at `docs/modules/ROOT/pages/reference/operators/generated`. Dropped the surrounding underscores from `_generated_` to avoid AsciiDoc's italic-formatting parser mangling the path in xrefs (the Antora build errored with `target of xref not found: reference/operators/generated/index.adoc` until the rename). The `_generated_` convention was pre-staged by the maintainers but never validated through a real Antora build. 2. `GenerateDocumentationTask.kt` — `generateMainIndex` now emits Antora-compatible xrefs. A new private `deriveAntoraXrefPrefix()` helper inspects the output directory's absolute path, looks for a `/pages/` segment, and returns everything below it as the xref prefix (e.g. `reference/operators/generated/`). When the output directory is NOT under an Antora module (any flat layout), the prefix is an empty string and the generator emits bare-filename xrefs, preserving the pre-Antora behavior for any callers still using it that way. 3. `docs/modules/ROOT/nav.adoc` now has a Reference entry `xref:reference/operators/generated/index.adoc[Operator coverage]`. Old `docs/modules/operators/_generated_/` tree deleted; the per-operator `.adoc` files tracked as git-mv renames into the new location (history preserved). Local build verified: `./gradlew generateDocs` populates the new location cleanly, Antora renders it with zero errors and the same 13 pre-existing warnings from the previous commit. Third step of the six-commit docs-to-Antora migration plan. See issue #494. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../main/kotlin/GenerateDocumentationTask.kt | 35 +++++++++++++++++-- build.gradle.kts | 6 ++-- docs/modules/ROOT/nav.adoc | 1 + .../reference/operators/generated/index.adoc | 14 ++++++++ .../operators/generated}/similarity.adoc | 0 .../operators/generated}/voidtensorops.adoc | 0 docs/modules/operators/_generated_/index.adoc | 14 -------- 7 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 docs/modules/ROOT/pages/reference/operators/generated/index.adoc rename docs/modules/{operators/_generated_ => ROOT/pages/reference/operators/generated}/similarity.adoc (100%) rename docs/modules/{operators/_generated_ => ROOT/pages/reference/operators/generated}/voidtensorops.adoc (100%) delete mode 100644 docs/modules/operators/_generated_/index.adoc diff --git a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt index 0d29fcfe..38cde1da 100644 --- a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt +++ b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt @@ -87,6 +87,14 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateMainIndex(module: OperatorDocModule, outputDir: File) { val indexFile = File(outputDir, "index.adoc") + // When the output directory sits under an Antora module's + // `modules//pages/` tree, xrefs in the emitted index + // must be resolved relative to that `pages/` root, not the + // current file. Auto-derive the prefix from the output path + // so the generator works both with Antora and with flat doc + // layouts (empty prefix -> bare filenames, the original + // behavior). 
+ val xrefPrefix = deriveAntoraXrefPrefix(outputDir) indexFile.writeText(buildString { appendLine("= AI-NET Operators Reference") appendLine("") @@ -94,18 +102,41 @@ abstract class GenerateDocumentationTask : DefaultTask() { appendLine("") appendLine("== Operators by Modality") appendLine("") - + val operatorsByModality = module.operators.groupBy { it.modality } operatorsByModality.forEach { (modality, operators) -> appendLine("=== ${modality.capitalize()}") appendLine("") operators.forEach { operator -> - appendLine("* xref:${operator.name.lowercase()}.adoc[${operator.name}]") + appendLine("* xref:$xrefPrefix${operator.name.lowercase()}.adoc[${operator.name}]") } appendLine("") } }) } + + /** + * If [outputDir] lives under an Antora `modules//pages/...` + * tree, return the path segment from `pages/` down to the output + * directory, suffixed with `/`. Otherwise return an empty string, + * so the generator emits bare-filename xrefs (the pre-Antora + * behavior). + * + * Example: + * ``` + * /repo/docs/modules/ROOT/pages/reference/operators/generated + * → "reference/operators/generated/" + * /repo/docs/operators/generated → "" + * ``` + */ + private fun deriveAntoraXrefPrefix(outputDir: File): String { + val path = outputDir.absolutePath.replace(File.separatorChar, '/') + val marker = "/pages/" + val idx = path.indexOf(marker) + if (idx < 0) return "" + val tail = path.substring(idx + marker.length) + return if (tail.isEmpty()) "" else "$tail/" + } private fun generateOperatorPage(operator: OperatorDoc, module: OperatorDocModule, outputDir: File) { val operatorFile = File(outputDir, "${operator.name.lowercase()}.adoc") diff --git a/build.gradle.kts b/build.gradle.kts index 27e43398..ca50b41c 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -100,10 +100,12 @@ tasks.register("generateOperatorDocs") { } } -// Documentation plugin configuration +// Documentation plugin configuration — emits operator doc fragments +// into the Antora ROOT module so the published 
site can surface them +// under Reference > Operator coverage. documentation { inputFile.set(file("skainet-lang/skainet-lang-core/build/generated/ksp/metadata/commonMain/resources/operators.json")) - outputDirectory.set(file("docs/modules/operators/_generated_")) + outputDirectory.set(file("docs/modules/ROOT/pages/reference/operators/generated")) includeBackendStatus.set(true) generateIndex.set(true) } diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 64b78995..70065e1b 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -16,6 +16,7 @@ .Reference * xref:reference/architecture.adoc[Architecture] +* xref:reference/operators/generated/index.adoc[Operator coverage] * xref:reference/api.adoc[API reference (Dokka)] .Explanation diff --git a/docs/modules/ROOT/pages/reference/operators/generated/index.adoc b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc new file mode 100644 index 00000000..e64fe818 --- /dev/null +++ b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc @@ -0,0 +1,14 @@ += AI-NET Operators Reference + +Generated from version `1.0.0` on 2026-04-13 + +== Operators by Modality + +=== Core + +* xref:reference/operators/generated/voidtensorops.adoc[VoidTensorOps] + +=== Composite + +* xref:reference/operators/generated/similarity.adoc[Similarity] + diff --git a/docs/modules/operators/_generated_/similarity.adoc b/docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc similarity index 100% rename from docs/modules/operators/_generated_/similarity.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc diff --git a/docs/modules/operators/_generated_/voidtensorops.adoc b/docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc similarity index 100% rename from docs/modules/operators/_generated_/voidtensorops.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc diff --git 
a/docs/modules/operators/_generated_/index.adoc b/docs/modules/operators/_generated_/index.adoc deleted file mode 100644 index e172df10..00000000 --- a/docs/modules/operators/_generated_/index.adoc +++ /dev/null @@ -1,14 +0,0 @@ -= AI-NET Operators Reference - -Generated from version `1.0.0` on 2026-03-03 - -== Operators by Modality - -=== Core - -* xref:voidtensorops.adoc[VoidTensorOps] - -=== Composite - -* xref:similarity.adoc[Similarity] - From 4e75aec87cfdd6d9e1eab18f4c93dda4f6dc1944 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:41:04 +0200 Subject: [PATCH 4/6] Emit cross-backend Operator Coverage Matrix (#494 step 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends GenerateDocumentationTask with an emitOpsStatusMatrix pass that writes a single-page operator x backend matrix to docs/modules/ROOT/pages/reference/ops-status-matrix.adoc — sibling of the per-operator pages, one click away from the operator index. Rows are `Operator.function` pairs; columns are every backend that appears in any function's statusByBackend map, sorted for column-order stability across runs. Cells carry emoji: ✅ supported / implemented / done ⚠️ partial ❌ not_supported / missing / unsupported ⏳ planned 🚧 in_progress / wip — no claim for this (function, backend) pair A totals footer row shows "Done = N / total" per backend — counts any status rendered with ✅. A missing entry is "unknown" rather than "not supported", and the preamble text makes that explicit so readers don't over-interpret blanks. The matrix output path is derived from the task's outputDirectory via .parentFile.parentFile so it lands at reference/ops-status-matrix.adoc under the Antora layout (the per-operator pages live at reference/operators/generated/). Falls back to writing next to outputDir when the parents are missing, preserving the task's behavior for non-Antora callers. 
Also widens shortStatus() to cover the implementation-style vocabulary the KSP processor actually emits today (`implemented`, `in_progress`, `missing`) alongside the planning-style vocabulary (`supported`, `partial`, `planned`, `not_supported`). Unknown values fall back to the raw string so the matrix never silently hides a status the generator didn't anticipate — strictly more permissive than the existing `formatStatus` used by the per-function table. Nav entry added under Reference: xref:reference/ops-status-matrix.adoc[Operator coverage matrix] Local build verified: matrix file generates with real KSP data, renders through Antora with zero errors and the same 13 pre-existing warnings. Fourth step of the six-commit docs-to-Antora migration plan. See issue #494. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../main/kotlin/GenerateDocumentationTask.kt | 125 +++++++++++++++++- docs/modules/ROOT/nav.adoc | 3 +- .../pages/reference/ops-status-matrix.adoc | 19 +++ 3 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 docs/modules/ROOT/pages/reference/ops-status-matrix.adoc diff --git a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt index 38cde1da..845c8e2f 100644 --- a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt +++ b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt @@ -65,14 +65,135 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateAsciidoc(module: OperatorDocModule, outputDir: File) { outputDir.mkdirs() - + if (generateIndex.getOrElse(true)) { generateMainIndex(module, outputDir) } - + module.operators.forEach { operator -> generateOperatorPage(operator, module, outputDir) } + + // Sibling cross-backend coverage matrix. Lives one level above + // the per-operator pages so a single URL gives the whole + // picture. Skipped when includeBackendStatus is disabled. 
+ if (includeBackendStatus.getOrElse(true)) { + emitOpsStatusMatrix(module, outputDir) + } + } + + /** + * Emit a single-page `ops-status-matrix.adoc` with rows of + * operator.function pairs and columns of every backend that + * appears in any function's `statusByBackend` map. Cells carry + * the status emoji; a totals footer shows how many functions + * each backend supports out of the total. + * + * Written to [outputDir].parentFile.parentFile so that, under the + * Antora `reference/operators/generated/` layout, the matrix + * lands at `reference/ops-status-matrix.adoc` — one navigable + * click away from the operator index and with a stable URL. + * Falls back to writing next to [outputDir] when the path + * doesn't have the expected depth (flat layouts). + */ + private fun emitOpsStatusMatrix(module: OperatorDocModule, outputDir: File) { + val matrixDir = outputDir.parentFile?.parentFile ?: outputDir + matrixDir.mkdirs() + val matrixFile = File(matrixDir, "ops-status-matrix.adoc") + + // Collect every backend that appears anywhere, sorted so the + // column order is stable across runs. + val allBackends: List = module.operators + .flatMap { op -> op.functions.flatMap { it.statusByBackend.keys } } + .toSortedSet() + .toList() + + // Row view: (operator, function) pair -> per-backend status. + data class Row(val operator: String, val function: String, val status: Map) + val rows: List = module.operators.flatMap { op -> + op.functions.map { fn -> Row(op.name, fn.name, fn.statusByBackend) } + } + + matrixFile.writeText(buildString { + appendLine("= Operator Coverage Matrix") + appendLine(":description: Cross-backend status for every operator function in SKaiNET.") + appendLine("") + appendLine("Generated from `operators.json` version `${module.version}` on ${formatTimestamp(module.timestamp)}.") + appendLine("") + appendLine("Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. 
A missing entry means the backend makes no claim about the function — treat it as \"unknown\", not \"not supported\".") + appendLine("") + if (rows.isEmpty() || allBackends.isEmpty()) { + appendLine("NOTE: No backend status information found in the source data.") + appendLine("") + return@buildString + } + + // Table header: 1 col for the row label + 1 col per backend. + val colSpec = (listOf("2") + List(allBackends.size) { "1" }).joinToString(",") + appendLine("[cols=\"$colSpec\", options=\"header\"]") + appendLine("|===") + append("| Operator.function ") + allBackends.forEach { append("| $it ") } + appendLine("") + appendLine("") + + rows.forEach { row -> + append("| `${row.operator}.${row.function}` ") + allBackends.forEach { backend -> + val raw = row.status[backend] + val cell = if (raw == null) "—" else shortStatus(raw) + append("| $cell ") + } + appendLine("") + } + + // Totals footer: number of "done" rows per backend out + // of total row count. A status counts as done when it + // maps to the green check in shortStatus. + appendLine("") + append("| *Done* ") + allBackends.forEach { backend -> + val n = rows.count { isDone(it.status[backend]) } + append("| *$n / ${rows.size}* ") + } + appendLine("") + appendLine("|===") + appendLine("") + appendLine("Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference].") + }) + } + + /** + * Short emoji-only rendering of a backend status, for use in the + * compact matrix cells. The long-form wording stays on the + * per-function backend-status table produced by + * [generateBackendStatusTable]. + * + * The vocabulary covers both the planning-style strings + * (`supported` / `partial` / `not_supported` / `planned`) and + * the implementation-style strings the KSP processor actually + * emits today (`implemented` / `in_progress` / `missing`). 
+ * Unknown values fall back to the raw string so the matrix + * never silently hides a status the generator didn't anticipate. + */ + private fun shortStatus(status: String): String = when (status.lowercase()) { + "supported", "implemented", "done" -> "✅" + "partial" -> "⚠️" + "not_supported", "missing", "unsupported" -> "❌" + "planned" -> "⏳" + "in_progress", "wip" -> "🚧" + else -> status + } + + /** + * Whether a status string counts toward the totals footer in + * the ops-status matrix. Mirrors the "green check" branch of + * [shortStatus] — any status rendered with ✅ is counted as + * done. + */ + private fun isDone(status: String?): Boolean = when (status?.lowercase()) { + "supported", "implemented", "done" -> true + else -> false } private fun generateMarkdown(module: OperatorDocModule, outputDir: File) { diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 70065e1b..70418739 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -16,7 +16,8 @@ .Reference * xref:reference/architecture.adoc[Architecture] -* xref:reference/operators/generated/index.adoc[Operator coverage] +* xref:reference/operators/generated/index.adoc[Operator reference] +* xref:reference/ops-status-matrix.adoc[Operator coverage matrix] * xref:reference/api.adoc[API reference (Dokka)] .Explanation diff --git a/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc new file mode 100644 index 00000000..6ee3957d --- /dev/null +++ b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc @@ -0,0 +1,19 @@ += Operator Coverage Matrix +:description: Cross-backend status for every operator function in SKaiNET. + +Generated from `operators.json` version `1.0.0` on 2026-04-13. + +Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. 
A missing entry means the backend makes no claim about the function — treat it as "unknown", not "not supported". + +[cols="2,1,1,1,1", options="header"] +|=== +| Operator.function | Metal | apple | cpu | wasm + +| `VoidTensorOps.matmul` | 🚧 | — | — | — +| `VoidTensorOps.transpose` | 🚧 | — | — | — +| `Similarity.cosineDistance` | — | ✅ | ✅ | ✅ + +| *Done* | *0 / 3* | *1 / 3* | *1 / 3* | *1 / 3* +|=== + +Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference]. From 4754ef93dc5d36fdd78116c1d171d03b04710d57 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:43:29 +0200 Subject: [PATCH 5/6] Add docs workflow, delete dokka-pages.yml (#494 step 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New .github/workflows/docs.yml that builds the Antora site on every PR and publishes it to GitHub Pages on pushes to develop. Mirrors the SKaiNET-transformers workflow layout but adds several SKaiNET-specific pre-steps and a buildx cache. Workflow steps (build-docs job): 1. Checkout. 2. actions/setup-java@v4 — JDK 25 on the runner, matching every other workflow in this repo. Runs outside the Antora container so the Gradle wrapper cache is reusable and generateDocs / dokkaGenerate see the right JDK. 3. Cache Gradle caches + wrapper for subsequent runs. 4. ./gradlew generateDocs dokkaGenerate — emits the KSP operator fragments + the ops-status-matrix.adoc into docs/modules/ROOT/pages/reference/operators/ AND builds the full Dokka HTML aggregate. Running dokka here (not just in commit 6) means commit 6 is a pure workflow-step + task-registration change with no extra Gradle cost. 5. docker/setup-buildx-action + docker/build-push-action with gha cache. The Chromium layer makes the image ~400 MB so first build is 3–5 minutes; subsequent runs are sub-minute via cache-from/to. Transformers skipped caching here; this workflow improves on that. 6. 
Docker run Antora inside the container, volume-mounting the workspace, workdir /antora/docs, entrypoint is the image's baked-in antora binary. 7. actions/upload-pages-artifact@v3 with path docs/build/site. Deploy job (deploy-docs) is gated on push to develop and consumes the uploaded artifact via actions/deploy-pages@v4. Triggers fire on paths that actually affect the docs site: everything under docs/, the workflow itself, the root build.gradle.kts, build-logic/, and skainet-lang-core (the KSP source the operator generator reads). .github/workflows/dokka-pages.yml is deleted. Only one workflow can own GitHub Pages and the new one publishes both Antora and (after commit 6) the bundled Dokka output. Deleting it in the same commit as enabling docs.yml creates a brief ~1 minute 404 window between the merge and the first new-workflow run; trigger a workflow_dispatch immediately post-merge to close it. bundleDokkaIntoSite is NOT invoked yet — that Gradle task does not exist until commit 6. Commit 5 can therefore be tested end-to-end (Antora builds, pages deploy) without Dokka being visible under /api/. Commit 6 adds the task and the one-line workflow edit to wire it in. Yaml linted with `python3 -c 'import yaml; yaml.safe_load(...)'`. Fifth step of the six-commit docs-to-Antora migration plan. See issue #494. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/docs.yml | 114 ++++++++++++++++++++++++++++++ .github/workflows/dokka-pages.yml | 56 --------------- 2 files changed, 114 insertions(+), 56 deletions(-) create mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/dokka-pages.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..e470924a --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,114 @@ +name: Docs + +# Build the Antora site (with generated operator pages and the +# cross-backend coverage matrix) on every PR, and publish to GitHub +# Pages on pushes to develop. Dokka API bundling is wired in +# commit 6 of the docs-to-Antora migration (see issue #494). + +on: + push: + branches: [ main, develop ] + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + pull_request: + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + workflow_dispatch: + +concurrency: + group: docs-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pages: write + id-token: write + +jobs: + build-docs: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + # JDK 25 matches the version used by every other workflow in + # this repo. Runs on the RUNNER, not inside the Docker + # container, so the Gradle wrapper cache works and generateDocs + # / dokkaGenerate see the right JDK. 
+ - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '25' + + - name: Cache Gradle + uses: actions/cache@v4 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + key: gradle-${{ runner.os }}-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties', '**/libs.versions.toml') }} + restore-keys: | + gradle-${{ runner.os }}- + + # Emit the KSP-driven operator fragments and the coverage + # matrix into docs/modules/ROOT/pages/reference/operators/. + # Also generate the full Dokka API aggregate so commit 6 can + # bundle it; running both here means commit 6 is a pure + # workflow-step + Gradle-task-registration change with no + # Gradle re-run cost. + - name: Generate operator docs and Dokka + run: ./gradlew --no-daemon generateDocs dokkaGenerate + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # The Chromium layer makes the image ~400 MB. First build is + # ~3–5 minutes; subsequent runs are sub-minute via the GHA + # cache. Transformers skipped caching here — this workflow + # improves on that. 
+ - name: Build Antora image + uses: docker/build-push-action@v5 + with: + context: docs/.docker + tags: skainet-antora:local + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build Antora site + run: | + docker run --rm \ + -v "${{ github.workspace }}:/antora" \ + --workdir /antora/docs \ + skainet-antora:local \ + --stacktrace \ + antora-playbook.yml + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/build/site + + deploy-docs: + if: github.ref == 'refs/heads/develop' && github.event_name == 'push' + needs: build-docs + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/dokka-pages.yml b/.github/workflows/dokka-pages.yml deleted file mode 100644 index ec20dd17..00000000 --- a/.github/workflows/dokka-pages.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Dokka API Docs → GitHub Pages - -on: - push: - branches: [ main, feature/14-dokka ] - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: pages - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Copy CI gradle.properties - run: mkdir -p ~/.gradle ; cp .github/ci-gradle.properties ~/.gradle/gradle.properties - - - name: Set up JDK 25 - uses: actions/setup-java@v5 - with: - distribution: 'zulu' - java-version: 25 - - - name: Setup Gradle - uses: gradle/actions/setup-gradle@v6 - - - name: Generate Dokka HTML - run: ./gradlew dokkaGenerate --no-daemon - - - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v4 - with: - path: build/dokka/html - - deploy: - needs: build - runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - - steps: - - name: 
Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v5 From 30488cd235d44c8b1a8fe18fc9663dffa86a4c06 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 13 Apr 2026 15:49:03 +0200 Subject: [PATCH 6/6] Wire Dokka API bundle into Antora site (#494 step 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two coupled changes that activate the sibling-path Dokka bundle under /api/ of the published site. 1. Root build.gradle.kts registers a new `bundleDokkaIntoSite` Copy task under the `documentation` group: tasks.register("bundleDokkaIntoSite") { dependsOn("dokkaGenerate") from(layout.buildDirectory.dir("dokka/html")) into(layout.projectDirectory.dir("docs/build/site/api")) } The task is INTENTIONALLY NOT wired into the `build` lifecycle. It's invoked only by the docs workflow, and only AFTER Antora has populated docs/build/site/. Pre-creating the target directory from a Gradle build would cause the later Antora run to wipe the api/ subtree on its fresh write; keeping the task workflow-invoked only avoids that ordering trap. 2. .github/workflows/docs.yml gets a new step between "Build Antora site" and "Upload artifact": - name: Bundle Dokka API into site run: ./gradlew --no-daemon bundleDokkaIntoSite Dokka itself is already generated upstream in the existing "Generate operator docs and Dokka" step from commit 5, so this is a pure Copy — no re-run cost. Net effect: the published site contains a full Antora site at the root and the full Dokka multi-module API aggregate under /api/. The reference/api.adoc page's `link:../api/index.html[API Reference]` stub from commit 2 resolves to a real page. Same origin, no CORS, served from the same GitHub Pages site as the Antora content. 
End-to-end local verification: rm -rf docs/build/site docker run --rm -v "$PWD:/antora" -w /antora \ skainet-antora:local docs/antora-playbook.yml ./gradlew --no-daemon bundleDokkaIntoSite ls docs/build/site/ # index.html, skainet/, api/ ls docs/build/site/api/ # index.html, skainet-lang/, # skainet-compile/, skainet-backends/, # skainet-io/, skainet-data/, ... Sixth and final step of the docs-to-Antora migration plan. See issue #494. Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/docs.yml | 8 ++++++++ build.gradle.kts | 13 +++++++++++++ 2 files changed, 21 insertions(+) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index e470924a..32a7ed1e 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -95,6 +95,14 @@ jobs: --stacktrace \ antora-playbook.yml + # Bundle Dokka HTML under a sibling `/api/` path of the + # Antora site. Must run AFTER Antora has populated + # docs/build/site/, never before — bundleDokkaIntoSite is a + # plain Copy task that would otherwise pre-create the target + # directory and the later Antora run would wipe it. + - name: Bundle Dokka API into site + run: ./gradlew --no-daemon bundleDokkaIntoSite + - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: diff --git a/build.gradle.kts b/build.gradle.kts index ca50b41c..0df84bb7 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -155,4 +155,17 @@ dependencies { // Other dokka(project(":skainet-pipeline")) dokka(project(":skainet-models:skainet-model-yolo")) +} + +// Copy the Dokka-generated HTML aggregate into the Antora site +// output as a sibling `/api/` path. Invoked by .github/workflows/docs.yml +// AFTER Antora has populated `docs/build/site/`; intentionally NOT +// wired into the `build` lifecycle so that running `./gradlew build` +// locally never silently creates a half-populated site directory. 
+tasks.register("bundleDokkaIntoSite") { + group = "documentation" + description = "Copy build/dokka/html into docs/build/site/api for GitHub Pages publish" + dependsOn("dokkaGenerate") + from(layout.buildDirectory.dir("dokka/html")) + into(layout.projectDirectory.dir("docs/build/site/api")) } \ No newline at end of file