diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..32a7ed1e --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,122 @@ +name: Docs + +# Build the Antora site (with generated operator pages and the +# cross-backend coverage matrix) on every PR, and publish to GitHub +# Pages on pushes to develop. Dokka API bundling is wired in +# commit 6 of the docs-to-Antora migration (see issue #494). + +on: + push: + branches: [ main, develop ] + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + pull_request: + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + workflow_dispatch: + +concurrency: + group: docs-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pages: write + id-token: write + +jobs: + build-docs: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + # JDK 25 matches the version used by every other workflow in + # this repo. Runs on the RUNNER, not inside the Docker + # container, so the Gradle wrapper cache works and generateDocs + # / dokkaGenerate see the right JDK. + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '25' + + - name: Cache Gradle + uses: actions/cache@v4 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + key: gradle-${{ runner.os }}-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties', '**/libs.versions.toml') }} + restore-keys: | + gradle-${{ runner.os }}- + + # Emit the KSP-driven operator fragments and the coverage + # matrix into docs/modules/ROOT/pages/reference/operators/. 
+ # Also generate the full Dokka API aggregate so commit 6 can + # bundle it; running both here means commit 6 is a pure + # workflow-step + Gradle-task-registration change with no + # Gradle re-run cost. + - name: Generate operator docs and Dokka + run: ./gradlew --no-daemon generateDocs dokkaGenerate + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # The Chromium layer makes the image ~400 MB. First build is + # ~3–5 minutes; subsequent runs are sub-minute via the GHA + # cache. Transformers skipped caching here β€” this workflow + # improves on that. + - name: Build Antora image + uses: docker/build-push-action@v5 + with: + context: docs/.docker + tags: skainet-antora:local + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build Antora site + run: | + docker run --rm \ + -v "${{ github.workspace }}:/antora" \ + --workdir /antora/docs \ + skainet-antora:local \ + --stacktrace \ + antora-playbook.yml + + # Bundle Dokka HTML under a sibling `/api/` path of the + # Antora site. Must run AFTER Antora has populated + # docs/build/site/, never before β€” bundleDokkaIntoSite is a + # plain Copy task that would otherwise pre-create the target + # directory and the later Antora run would wipe it. 
+ - name: Bundle Dokka API into site + run: ./gradlew --no-daemon bundleDokkaIntoSite + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/build/site + + deploy-docs: + if: github.ref == 'refs/heads/develop' && github.event_name == 'push' + needs: build-docs + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/dokka-pages.yml b/.github/workflows/dokka-pages.yml deleted file mode 100644 index ec20dd17..00000000 --- a/.github/workflows/dokka-pages.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Dokka API Docs β†’ GitHub Pages - -on: - push: - branches: [ main, feature/14-dokka ] - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: pages - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Copy CI gradle.properties - run: mkdir -p ~/.gradle ; cp .github/ci-gradle.properties ~/.gradle/gradle.properties - - - name: Set up JDK 25 - uses: actions/setup-java@v5 - with: - distribution: 'zulu' - java-version: 25 - - - name: Setup Gradle - uses: gradle/actions/setup-gradle@v6 - - - name: Generate Dokka HTML - run: ./gradlew dokkaGenerate --no-daemon - - - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v4 - with: - path: build/dokka/html - - deploy: - needs: build - runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v5 diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 564b49fc..aecc5c0f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ -# πŸ—οΈ Architecture -SKaiNET uses a hybrid backend strategy that separates 
development iteration from production deployment. +# Architecture -![Architecture diagram of SKaiNET compiler](docs/SKaiNET-compiler.svg) +See the published site: +https://skainet-developers.github.io/SKaiNET/skainet/reference/architecture.html diff --git a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt index 0d29fcfe..845c8e2f 100644 --- a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt +++ b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt @@ -65,14 +65,135 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateAsciidoc(module: OperatorDocModule, outputDir: File) { outputDir.mkdirs() - + if (generateIndex.getOrElse(true)) { generateMainIndex(module, outputDir) } - + module.operators.forEach { operator -> generateOperatorPage(operator, module, outputDir) } + + // Sibling cross-backend coverage matrix. Lives one level above + // the per-operator pages so a single URL gives the whole + // picture. Skipped when includeBackendStatus is disabled. + if (includeBackendStatus.getOrElse(true)) { + emitOpsStatusMatrix(module, outputDir) + } + } + + /** + * Emit a single-page `ops-status-matrix.adoc` with rows of + * operator.function pairs and columns of every backend that + * appears in any function's `statusByBackend` map. Cells carry + * the status emoji; a totals footer shows how many functions + * each backend supports out of the total. + * + * Written to [outputDir].parentFile.parentFile so that, under the + * Antora `reference/operators/generated/` layout, the matrix + * lands at `reference/ops-status-matrix.adoc` β€” one navigable + * click away from the operator index and with a stable URL. + * Falls back to writing next to [outputDir] when the path + * doesn't have the expected depth (flat layouts). 
+ */ + private fun emitOpsStatusMatrix(module: OperatorDocModule, outputDir: File) { + val matrixDir = outputDir.parentFile?.parentFile ?: outputDir + matrixDir.mkdirs() + val matrixFile = File(matrixDir, "ops-status-matrix.adoc") + + // Collect every backend that appears anywhere, sorted so the + // column order is stable across runs. + val allBackends: List = module.operators + .flatMap { op -> op.functions.flatMap { it.statusByBackend.keys } } + .toSortedSet() + .toList() + + // Row view: (operator, function) pair -> per-backend status. + data class Row(val operator: String, val function: String, val status: Map) + val rows: List = module.operators.flatMap { op -> + op.functions.map { fn -> Row(op.name, fn.name, fn.statusByBackend) } + } + + matrixFile.writeText(buildString { + appendLine("= Operator Coverage Matrix") + appendLine(":description: Cross-backend status for every operator function in SKaiNET.") + appendLine("") + appendLine("Generated from `operators.json` version `${module.version}` on ${formatTimestamp(module.timestamp)}.") + appendLine("") + appendLine("Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. A missing entry means the backend makes no claim about the function β€” treat it as \"unknown\", not \"not supported\".") + appendLine("") + if (rows.isEmpty() || allBackends.isEmpty()) { + appendLine("NOTE: No backend status information found in the source data.") + appendLine("") + return@buildString + } + + // Table header: 1 col for the row label + 1 col per backend. 
+ val colSpec = (listOf("2") + List(allBackends.size) { "1" }).joinToString(",") + appendLine("[cols=\"$colSpec\", options=\"header\"]") + appendLine("|===") + append("| Operator.function ") + allBackends.forEach { append("| $it ") } + appendLine("") + appendLine("") + + rows.forEach { row -> + append("| `${row.operator}.${row.function}` ") + allBackends.forEach { backend -> + val raw = row.status[backend] + val cell = if (raw == null) "β€”" else shortStatus(raw) + append("| $cell ") + } + appendLine("") + } + + // Totals footer: number of "done" rows per backend out + // of total row count. A status counts as done when it + // maps to the green check in shortStatus. + appendLine("") + append("| *Done* ") + allBackends.forEach { backend -> + val n = rows.count { isDone(it.status[backend]) } + append("| *$n / ${rows.size}* ") + } + appendLine("") + appendLine("|===") + appendLine("") + appendLine("Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference].") + }) + } + + /** + * Short emoji-only rendering of a backend status, for use in the + * compact matrix cells. The long-form wording stays on the + * per-function backend-status table produced by + * [generateBackendStatusTable]. + * + * The vocabulary covers both the planning-style strings + * (`supported` / `partial` / `not_supported` / `planned`) and + * the implementation-style strings the KSP processor actually + * emits today (`implemented` / `in_progress` / `missing`). + * Unknown values fall back to the raw string so the matrix + * never silently hides a status the generator didn't anticipate. 
+ */ + private fun shortStatus(status: String): String = when (status.lowercase()) { + "supported", "implemented", "done" -> "βœ…" + "partial" -> "⚠️" + "not_supported", "missing", "unsupported" -> "❌" + "planned" -> "⏳" + "in_progress", "wip" -> "🚧" + else -> status + } + + /** + * Whether a status string counts toward the totals footer in + * the ops-status matrix. Mirrors the "green check" branch of + * [shortStatus] β€” any status rendered with βœ… is counted as + * done. + */ + private fun isDone(status: String?): Boolean = when (status?.lowercase()) { + "supported", "implemented", "done" -> true + else -> false } private fun generateMarkdown(module: OperatorDocModule, outputDir: File) { @@ -87,6 +208,14 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateMainIndex(module: OperatorDocModule, outputDir: File) { val indexFile = File(outputDir, "index.adoc") + // When the output directory sits under an Antora module's + // `modules//pages/` tree, xrefs in the emitted index + // must be resolved relative to that `pages/` root, not the + // current file. Auto-derive the prefix from the output path + // so the generator works both with Antora and with flat doc + // layouts (empty prefix -> bare filenames, the original + // behavior). 
+ val xrefPrefix = deriveAntoraXrefPrefix(outputDir) indexFile.writeText(buildString { appendLine("= AI-NET Operators Reference") appendLine("") @@ -94,18 +223,41 @@ abstract class GenerateDocumentationTask : DefaultTask() { appendLine("") appendLine("== Operators by Modality") appendLine("") - + val operatorsByModality = module.operators.groupBy { it.modality } operatorsByModality.forEach { (modality, operators) -> appendLine("=== ${modality.capitalize()}") appendLine("") operators.forEach { operator -> - appendLine("* xref:${operator.name.lowercase()}.adoc[${operator.name}]") + appendLine("* xref:$xrefPrefix${operator.name.lowercase()}.adoc[${operator.name}]") } appendLine("") } }) } + + /** + * If [outputDir] lives under an Antora `modules//pages/...` + * tree, return the path segment from `pages/` down to the output + * directory, suffixed with `/`. Otherwise return an empty string, + * so the generator emits bare-filename xrefs (the pre-Antora + * behavior). + * + * Example: + * ``` + * /repo/docs/modules/ROOT/pages/reference/operators/generated + * β†’ "reference/operators/generated/" + * /repo/docs/operators/generated β†’ "" + * ``` + */ + private fun deriveAntoraXrefPrefix(outputDir: File): String { + val path = outputDir.absolutePath.replace(File.separatorChar, '/') + val marker = "/pages/" + val idx = path.indexOf(marker) + if (idx < 0) return "" + val tail = path.substring(idx + marker.length) + return if (tail.isEmpty()) "" else "$tail/" + } private fun generateOperatorPage(operator: OperatorDoc, module: OperatorDocModule, outputDir: File) { val operatorFile = File(outputDir, "${operator.name.lowercase()}.adoc") diff --git a/build.gradle.kts b/build.gradle.kts index 27e43398..0df84bb7 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -100,10 +100,12 @@ tasks.register("generateOperatorDocs") { } } -// Documentation plugin configuration +// Documentation plugin configuration β€” emits operator doc fragments +// into the Antora ROOT module so the 
published site can surface them +// under Reference > Operator coverage. documentation { inputFile.set(file("skainet-lang/skainet-lang-core/build/generated/ksp/metadata/commonMain/resources/operators.json")) - outputDirectory.set(file("docs/modules/operators/_generated_")) + outputDirectory.set(file("docs/modules/ROOT/pages/reference/operators/generated")) includeBackendStatus.set(true) generateIndex.set(true) } @@ -153,4 +155,17 @@ dependencies { // Other dokka(project(":skainet-pipeline")) dokka(project(":skainet-models:skainet-model-yolo")) +} + +// Copy the Dokka-generated HTML aggregate into the Antora site +// output as a sibling `/api/` path. Invoked by .github/workflows/docs.yml +// AFTER Antora has populated `docs/build/site/`; intentionally NOT +// wired into the `build` lifecycle so that running `./gradlew build` +// locally never silently creates a half-populated site directory. +tasks.register("bundleDokkaIntoSite") { + group = "documentation" + description = "Copy build/dokka/html into docs/build/site/api for GitHub Pages publish" + dependsOn("dokkaGenerate") + from(layout.buildDirectory.dir("dokka/html")) + into(layout.projectDirectory.dir("docs/build/site/api")) } \ No newline at end of file diff --git a/docs/.docker/Dockerfile b/docs/.docker/Dockerfile new file mode 100644 index 00000000..67c21ba6 --- /dev/null +++ b/docs/.docker/Dockerfile @@ -0,0 +1,37 @@ +FROM node:20-alpine + +LABEL org.opencontainers.image.title="SKaiNET Antora" \ + org.opencontainers.image.description="Antora site generator with built-in Mermaid rendering" \ + org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET-transformers" + +# Chromium for mermaid-cli (puppeteer) +RUN apk add --no-cache chromium font-noto + +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \ + PUPPETEER_SKIP_DOWNLOAD=true + +# Install Antora + extensions to /opt/antora (not /antora which gets volume-mounted) +WORKDIR /opt/antora +RUN npm init -y && npm i --save-exact \ + 
@antora/cli@3.1 \ + @antora/site-generator@3.1 \ + asciidoctor-kroki@0.18 \ + @mermaid-js/mermaid-cli@11 \ + && npm cache clean --force + +# Make installed modules visible when workdir is the mounted project +ENV NODE_PATH=/opt/antora/node_modules + +# Mermaid-cli config +RUN echo '{ \ + "executablePath": "/usr/bin/chromium-browser", \ + "args": ["--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"] \ +}' > /opt/antora/puppeteer-config.json + +# Verify mermaid works +RUN echo 'graph TD; A-->B;' > /tmp/test.mmd \ + && npx mmdc -i /tmp/test.mmd -o /tmp/test.svg -p /opt/antora/puppeteer-config.json \ + && rm /tmp/test.mmd /tmp/test.svg + +ENTRYPOINT ["/opt/antora/node_modules/.bin/antora"] +CMD ["--stacktrace", "antora-playbook.yml"] diff --git a/docs/antora-playbook.yml b/docs/antora-playbook.yml new file mode 100644 index 00000000..4c7b9bca --- /dev/null +++ b/docs/antora-playbook.yml @@ -0,0 +1,26 @@ +site: + title: SKaiNET + start_page: skainet::index.adoc + +content: + sources: + - url: /antora + start_path: docs + branches: HEAD + +asciidoc: + extensions: + - asciidoctor-kroki + attributes: + # Use local mermaid-cli via Kroki (no external server needed when + # built with the custom Docker image in docs/.docker/Dockerfile β€” + # copied verbatim from SKaiNET-transformers). 
+ kroki-fetch-diagram: true + +ui: + bundle: + url: https://gitlab.com/antora/antora-ui-default/-/jobs/artifacts/HEAD/raw/build/ui-bundle.zip?job=bundle-stable + snapshot: true + +output: + dir: ./build/site diff --git a/docs/antora.yml b/docs/antora.yml new file mode 100644 index 00000000..05bf9566 --- /dev/null +++ b/docs/antora.yml @@ -0,0 +1,5 @@ +name: skainet +title: SKaiNET +version: ~ +nav: + - modules/ROOT/nav.adoc diff --git a/docs/arduino-c-codegen.md b/docs/arduino-c-codegen.md deleted file mode 100644 index 5bc9eda8..00000000 --- a/docs/arduino-c-codegen.md +++ /dev/null @@ -1,75 +0,0 @@ -# Arduino C Code Generation - -SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. - -## Overview - -The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. - -### Codegen Pipeline - -```mermaid -graph TD - A[Kotlin Model] --> B[Recording Pass] - B --> C[Execution Tape] - C --> D[Compute Graph] - D --> E[Graph Validation] - E --> F[Memory Layout Calculation] - F --> G[C Code Emission] - G --> H[Arduino Library Packaging] - H --> I[Generated .h/.c files] -``` - -## Technical Deep Dive - -### 1. Tape-based Tracing -Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. -- Every operation (Dense, ReLU, etc.) is recorded onto an **Execution Tape**. -- This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. - -### 2. Compute Graph Construction -The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. 
-- Nodes represent operations (Ops). -- Edges represent data flow (Tensors). -- During this phase, the compiler performs **Shape Inference** to ensure every tensor has a fixed, known size. - -### 3. Static Memory Management -Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a **Ping-Pong Buffer Strategy** to eliminate dynamic memory allocation (`malloc`/`free`) during inference. - -#### Ping-Pong Buffer Strategy -The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. - -```mermaid -sequenceDiagram - participant I as Input - participant B1 as Buffer A - participant B2 as Buffer B - participant O as Output - - I->>B1: Layer 1 (Input -> A) - B1->>B2: Layer 2 (A -> B) - B2->>B1: Layer 3 (B -> A) - B1->>O: Layer 4 (A -> Output) -``` - -- **Buffer Reuse**: Instead of allocating space for every layer's output, buffers are reused. -- **Direct Output Optimization**: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. - -### 4. Code Generation (Emission) -The `CCodeGenerator` emits C99-compatible code using templates. -- **Weights & Biases**: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. -- **Kernel Implementation**: Operations like `Dense` (Linear) are implemented as optimized nested loops. -- **Header Generation**: Produces a clean API for the user: - ```c - int model_inference(const float* input, float* output); - ``` - -### 5. Validation -The generator performs post-generation validation: -- **Static Allocation Check**: Ensures no dynamic allocation is present in the generated source. -- **Buffer Alternation Check**: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. 
- -## Performance and Constraints -- **Floating Point**: Currently optimized for `FP32`. -- **Supported Ops**: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. -- **Memory**: Total memory consumption is `TotalWeights + 2 * MaxIntermediateTensor`. diff --git a/docs/build_help.md b/docs/build_help.md deleted file mode 100644 index b6413e73..00000000 --- a/docs/build_help.md +++ /dev/null @@ -1,81 +0,0 @@ -# Build Help - -## Dokka API Documentation - -SKaiNET uses [Dokka 2.1.0](https://github.com/Kotlin/dokka) to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. - -### Generating docs locally - -**Single module:** - -```bash -./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml -``` - -Output: `skainet-lang/skainet-lang-core/build/dokka/html/` - -**Aggregated (all modules):** - -```bash -./gradlew dokkaGenerate -``` - -Output: `build/dokka/html/index.html` - -### Convention plugin details - -The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: - -- **moduleName** from `project.name` -- **moduleVersion** from the `VERSION_NAME` Gradle property -- **Documented visibilities:** public only -- **Suppressed generated files:** KSP-generated code is excluded -- **Suppressed native source sets:** `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols -- **Source links** pointing to the GitHub repository - -### Modules with Dokka enabled - -The plugin is applied to 21 library modules: - -| Group | Modules | -|-------|---------| -| skainet-lang | `skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` | -| skainet-compile | `skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, 
`skainet-compile-hlo`, `skainet-compile-c` | -| skainet-backends | `skainet-backend-cpu` | -| skainet-data | `skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, `skainet-data-media` | -| skainet-io | `skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` | -| Other | `skainet-pipeline`, `skainet-model-yolo` | - -**Excluded:** `skainet-bom` (no source), `skainet-apps/*`, `skainet-test/*`, benchmarks, and `skainet-lang-ksp-processor` (internal). - -### Root-level aggregation - -The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. - -### KSP interaction - -`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: - -```kotlin -tasks.matching { it.name.startsWith("dokka") }.configureEach { - dependsOn("kspCommonMainKotlinMetadata") -} -``` - -This ensures KSP-generated sources are available before Dokka runs. - -### GitHub Pages deployment - -The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow_dispatch`). It: - -1. Checks out the repo -2. Sets up JDK 25 -3. Runs `./gradlew dokkaGenerate` -4. Uploads the `build/dokka/html` directory as a Pages artifact -5. Deploys to GitHub Pages using `actions/deploy-pages@v4` - -**Prerequisite:** The repository must have Pages configured to deploy from GitHub Actions (Settings > Pages > Source: "GitHub Actions"). - -### Operator docs (unchanged) - -The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. 
diff --git a/docs/kllama-getting-started.md b/docs/kllama-getting-started.md deleted file mode 100644 index 7e7fb8e9..00000000 --- a/docs/kllama-getting-started.md +++ /dev/null @@ -1,22 +0,0 @@ -# KLlama Getting Started - -KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. - -> **Early Stage Development**: The project is in active development. We appreciate your feedback and bug reports! - -## Choose Your Path - -| Goal | Guide | -|---|---| -| **Run models from the command line** | [KLlama CLI](../skainet-apps/skainet-kllama-cli/README.md) | -| **Embed in a Kotlin application** | [KLlama Library](../skainet-apps/skainet-kllama/README.md) | -| **Embed in a Java application** | [Java LLM Inference Guide](java-llm-inference.md) | -| **Build a standalone Java CLI app** | [Java CLI App Guide](java-cli-app.md) | -| **Java project setup (Maven / Gradle)** | [Java Getting Started](java-getting-started.md) | - -## Quick Links - -- [Supported formats & quantization](../skainet-apps/skainet-kllama/README.md#supported-formats--quantization) -- [Custom backend integration](../skainet-apps/skainet-kllama/README.md#custom-backend-integration) -- [Agent & tool calling](java-llm-inference.md#agent-loop-and-tool-calling) -- [BERT embeddings & similarity](java-llm-inference.md#bert-encoding-and-similarity) diff --git a/docs/SKaiNET-compiler.svg b/docs/modules/ROOT/images/SKaiNET-compiler.svg similarity index 100% rename from docs/SKaiNET-compiler.svg rename to docs/modules/ROOT/images/SKaiNET-compiler.svg diff --git a/docs/SKaiNET-logo.png b/docs/modules/ROOT/images/SKaiNET-logo.png similarity index 100% rename from docs/SKaiNET-logo.png rename to docs/modules/ROOT/images/SKaiNET-logo.png diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc new file mode 100644 index 00000000..70418739 --- /dev/null +++ 
b/docs/modules/ROOT/nav.adoc @@ -0,0 +1,32 @@ +* xref:index.adoc[Overview] + +.Tutorials +* xref:tutorials/java-getting-started.adoc[Java getting started] +* xref:tutorials/kllama-getting-started.adoc[KLlama getting started] +* xref:tutorials/hlo-getting-started.adoc[StableHLO getting started] +* xref:tutorials/graph-dsl.adoc[Graph DSL] + +.How-to guides +* xref:how-to/build.adoc[Build from source] +* xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] +* xref:how-to/java-cli-app.adoc[Build a Java CLI app] +* xref:how-to/java-llm-inference.adoc[Run LLM inference] +* xref:how-to/java-model-training.adoc[Train a model] +* xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino] + +.Reference +* xref:reference/architecture.adoc[Architecture] +* xref:reference/operators/generated/index.adoc[Operator reference] +* xref:reference/ops-status-matrix.adoc[Operator coverage matrix] +* xref:reference/api.adoc[API reference (Dokka)] + +.Explanation +* xref:explanation/skainet-for-ai.adoc[SKaiNET for AI/ML] +* xref:explanation/operator-design.adoc[Operator documentation system] +* xref:explanation/theory/index.adoc[Mathematical theory] +** xref:explanation/theory/matmul.adoc[Matrix multiplication] +* xref:explanation/examples/index.adoc[Worked examples] +** xref:explanation/examples/matmul.adoc[Matrix multiplication examples] +* xref:explanation/perf/jvm-cpu.adoc[JVM CPU performance] +* xref:explanation/perf/java-25-cpu-backend.adoc[Java 25 CPU backend notes] +* xref:explanation/issues/native-macos-accelerate-simd.adoc[Native macOS Accelerate SIMD issues] diff --git a/docs/examples/index.adoc b/docs/modules/ROOT/pages/explanation/examples/index.adoc similarity index 87% rename from docs/examples/index.adoc rename to docs/modules/ROOT/pages/explanation/examples/index.adoc index 97630bec..36946d7d 100644 --- a/docs/examples/index.adoc +++ b/docs/modules/ROOT/pages/explanation/examples/index.adoc @@ -7,7 +7,7 @@ This section contains practical examples and usage 
patterns for SKaiNET operator === Linear Algebra -include::matmul-examples.adoc[leveloffset=+2] +include::matmul.adoc[leveloffset=+2] === Tensor Creation and Manipulation @@ -53,5 +53,5 @@ include::matmul-examples.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../theory/index.adoc[Mathematical Theory] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/theory/index.adoc[Mathematical Theory] +// Operator reference lands in a later commit of the Antora migration. \ No newline at end of file diff --git a/docs/examples/matmul-examples.adoc b/docs/modules/ROOT/pages/explanation/examples/matmul.adoc similarity index 100% rename from docs/examples/matmul-examples.adoc rename to docs/modules/ROOT/pages/explanation/examples/matmul.adoc diff --git a/docs/issues/native-macos-accelerate-simd.md b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc similarity index 51% rename from docs/issues/native-macos-accelerate-simd.md rename to docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc index b0317c92..4fa01b33 100644 --- a/docs/issues/native-macos-accelerate-simd.md +++ b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc @@ -1,6 +1,6 @@ -# Native macOS SIMD acceleration via Apple Accelerate framework +== Native macOS SIMD acceleration via Apple Accelerate framework -## Problem +=== Problem The `skainet-backend-cpu` module on Kotlin/Native macOS (macosArm64) uses plain scalar loops for all tensor operations (`DefaultCpuOps`). On JVM, the same module uses the JDK Vector API @@ -11,71 +11,76 @@ When running LLM inference benchmarks via the `llm-performance` native binary, t is 5-10x slower than it needs to be because every matmul is a triple-nested scalar loop (`DefaultCpuOps.kt:264-272`). 
-## Proposed solution +=== Proposed solution Add an Accelerate-backed `TensorOps` implementation for the macOS native target, mirroring how the JVM target has `DefaultCpuOpsJvm`. Apple's Accelerate framework provides hardware-optimized BLAS and vector DSP routines that leverage ARM NEON and AMX under the hood. -### Architecture +==== Architecture -``` +.... PlatformCpuOpsFactory β”œβ”€β”€ jvmMain β†’ DefaultCpuOpsJvm (Vector API + optional BLAS) ← exists β”œβ”€β”€ nativeMain β†’ DefaultCpuOps (scalar fallback) ← exists β”œβ”€β”€ macosMain β†’ AccelerateCpuOps (Accelerate framework via cinterop) ← NEW └── linuxMain β†’ DefaultCpuOps (scalar, or OpenBLAS in future) ← unchanged -``` +.... -### Key changes +==== Key changes -**1. Cinterop definition** β€” `src/nativeInterop/cinterop/accelerate.def` +*1. Cinterop definition* β€” `src/nativeInterop/cinterop/accelerate.def` -```def +[source,def] +---- package = platform.accelerate language = C headers = Accelerate/Accelerate.h compilerOpts = -framework Accelerate linkerOpts = -framework Accelerate -``` +---- -**2. New class** β€” `src/macosMain/kotlin/.../AccelerateCpuOps.kt` +*2. 
New class* β€” `src/macosMain/kotlin/.../AccelerateCpuOps.kt` Extends `DefaultCpuOps` and overrides hot-path operations with Accelerate calls: -| Priority | Operation | Accelerate function | Impact | -|----------|-----------|---------------------|--------| -| P0 | `matmul` | `cblas_sgemm` | Dominant cost in LLM inference (~90% of forward pass) | -| P1 | `add` | `vDSP_vadd` | Elementwise add (residual connections) | -| P1 | `multiply` | `vDSP_vmul` | Elementwise multiply (gates, scaling) | -| P1 | `subtract` | `vDSP_vsub` | Elementwise subtract | -| P1 | `divide` | `vDSP_vdiv` | Elementwise divide | -| P2 | `sum` (global) | `vDSP_sve` | Reduction for normalization | -| P2 | `mean` (global) | `vDSP_meanv` | Reduction for normalization | -| P2 | `softmax` | `vDSP_vse` + manual | Attention weights | -| P3 | `relu` | `vDSP_vthres` / `vDSP_vthr` | Activation function | -| P3 | `silu` | manual vectorized loop | Activation function (SiLU = x * sigmoid(x)) | -| P3 | `transpose` | `vDSP_mtrans` | Matrix transpose | - -**3. Platform factory** β€” update `PlatformCpuOpsFactory` for macOS - -```kotlin +[cols=",,,",options="header",] +|=== +|Priority |Operation |Accelerate function |Impact +|P0 |`matmul` |`cblas++_++sgemm` |Dominant cost in LLM inference (~90% of forward pass) +|P1 |`add` |`vDSP++_++vadd` |Elementwise add (residual connections) +|P1 |`multiply` |`vDSP++_++vmul` |Elementwise multiply (gates, scaling) +|P1 |`subtract` |`vDSP++_++vsub` |Elementwise subtract +|P1 |`divide` |`vDSP++_++vdiv` |Elementwise divide +|P2 |`sum` (global) |`vDSP++_++sve` |Reduction for normalization +|P2 |`mean` (global) |`vDSP++_++meanv` |Reduction for normalization +|P2 |`softmax` |`vDSP++_++vse` {plus} manual |Attention weights +|P3 |`relu` |`vDSP++_++vthres` / `vDSP++_++vthr` |Activation function +|P3 |`silu` |manual vectorized loop |Activation function (SiLU = x ++*++ sigmoid(x)) +|P3 |`transpose` |`vDSP++_++mtrans` |Matrix transpose +|=== + +*3. 
Platform factory* β€” update `PlatformCpuOpsFactory` for macOS + +[source,kotlin] +---- // src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { println("[SKaiNET] Using Accelerate-backed CPU operations (ARM NEON + AMX)") return { factory -> AccelerateCpuOps(factory) } } -``` +---- This requires splitting the current `nativeMain` expect/actual into separate `macosMain` and `linuxMain` actuals (the `macosMain` source set already exists in `build.gradle.kts`). -**4. Build changes** β€” `build.gradle.kts` +*4. Build changes* β€” `build.gradle.kts` Add cinterop configuration for macosArm64 (and optionally iosArm64/iosSimulatorArm64): -```kotlin +[source,kotlin] +---- macosArm64 { compilations["main"].cinterops { val accelerate by creating { @@ -83,43 +88,43 @@ macosArm64 { } } } -``` +---- Add linker opts for the Accelerate framework to all macOS/iOS binaries. -### Implementation notes +==== Implementation notes -- `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. - Non-accelerated operations fall through to the scalar implementation. -- The `matmul` override should handle 2D FP32 tensors with `cblas_sgemm` and delegate - batched/non-float cases to `super.matmul()`. -- `vDSP_*` functions operate on contiguous `FloatArray` buffers. Tensors backed by - `FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. -- Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer - and only dispatch the contiguous inner loop to Accelerate. -- The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since - Accelerate is available on all Apple platforms. +* `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. +Non-accelerated operations fall through to the scalar implementation. 
+* The `matmul` override should handle 2D FP32 tensors with `cblas++_++sgemm` and delegate +batched/non-float cases to `super.matmul()`. +* `vDSP++_*++` functions operate on contiguous `FloatArray` buffers. Tensors backed by +`FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. +* Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer +and only dispatch the contiguous inner loop to Accelerate. +* The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since +Accelerate is available on all Apple platforms. -### Testing +==== Testing -- Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). -- Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output - or add a query method). -- Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend - vs Accelerate backend on the same model. +* Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). +* Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output +or add a query method). +* Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend +vs Accelerate backend on the same model. 
-### Expected impact +==== Expected impact Based on JVM BLAS vs scalar measurements and Apple's published Accelerate performance data: -- **matmul**: 10-50x speedup (NEON + AMX vs scalar loop) -- **elementwise**: 4-8x speedup (NEON vectorization) -- **reductions**: 4-8x speedup (NEON vectorization) -- **overall LLM inference**: 5-20x speedup on native macOS CPU backend +* *matmul*: 10-50x speedup (NEON {plus} AMX vs scalar loop) +* *elementwise*: 4-8x speedup (NEON vectorization) +* *reductions*: 4-8x speedup (NEON vectorization) +* *overall LLM inference*: 5-20x speedup on native macOS CPU backend -### Files to create/modify +==== Files to create/modify -``` +.... skainet-backends/skainet-backend-cpu/ β”œβ”€β”€ build.gradle.kts # add cinterop β”œβ”€β”€ src/nativeInterop/cinterop/accelerate.def # NEW @@ -127,12 +132,12 @@ skainet-backends/skainet-backend-cpu/ β”œβ”€β”€ src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt # NEW β”œβ”€β”€ src/linuxMain/kotlin/.../PlatformCpuOpsFactory.linux.kt # NEW (move from nativeMain) └── src/nativeMain/kotlin/.../PlatformCpuOpsFactory.native.kt # REMOVE (split to platform-specific) -``` +.... 
-### References +==== References -- JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` -- JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` -- Apple Accelerate docs: https://developer.apple.com/documentation/accelerate -- CBLAS reference: https://developer.apple.com/documentation/accelerate/blas -- vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp +* JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` +* JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` +* Apple Accelerate docs: https://developer.apple.com/documentation/accelerate +* CBLAS reference: https://developer.apple.com/documentation/accelerate/blas +* vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp diff --git a/docs/ops-docs.adoc b/docs/modules/ROOT/pages/explanation/operator-design.adoc similarity index 96% rename from docs/ops-docs.adoc rename to docs/modules/ROOT/pages/explanation/operator-design.adoc index faa5d75c..6ce1d8d4 100644 --- a/docs/ops-docs.adoc +++ b/docs/modules/ROOT/pages/explanation/operator-design.adoc @@ -101,7 +101,7 @@ Your article must be written in AsciiDoc and include the following sections (use - Show how fragments embed: β€’ An API signature block β€’ A status table by backend - β€’ Pointers (xref::) to human-written math/semantics sections + β€’ Pointers (`xref:`) to human-written math/semantics sections - Provide example AsciiDoc fragment: [source,adoc] @@ -126,7 +126,7 @@ Your article must be written in AsciiDoc and include the following sections (use See xref:theory/matmul.adoc#definition[MatMul semantics] and xref:examples/matmul.adoc#examples[Examples]. 
---- -- Demonstrate combining generated and human-written docs via include:: and xref::, with a small folder layout: +- Demonstrate combining generated and human-written docs via `include::` and `xref:`, with a small folder layout: [source,text] ---- docs/ @@ -147,7 +147,7 @@ Your article must be written in AsciiDoc and include the following sections (use β€’ Human-written caveats that reference generated statuses via xref anchors. - Show a synchronization flow as a Mermaid diagram: - [source,mermaid] + [mermaid] ---- flowchart LR A[Operator Interfaces (KMP)] --> B[KSP Processor] @@ -226,7 +226,7 @@ Your article must be written in AsciiDoc and include the following sections (use ---- - Show the KSP-produced JSON excerpt and the corresponding generated AsciiDoc fragment for at least one function (e.g., relu). -- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it’s included via xref:: from the generated fragment. +- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it is included via `xref:` from the generated fragment. == 8. Summary and Benefits - Summarize benefits: @@ -246,7 +246,7 @@ APPENDIX (OPTIONAL BUT STRONGLY RECOMMENDED) OUTPUT FORMAT REQUIREMENTS - Write the entire article as **AsciiDoc**. -- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [source,mermaid], [source,plantuml] (PlantUML optional). +- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [mermaid], [source,plantuml] (PlantUML optional). - Use short paragraphs and bullet lists; avoid filler or marketing language. - Include at least: β€’ One Mermaid diagram (the pipeline). @@ -260,7 +260,7 @@ ACCEPTANCE CHECKLIST (the output must satisfy all) - [ ] Clear definition of β€œreflective documentation” and how it differs from classic docgen. 
- [ ] KSP plan, annotation semantics, and JSON Schema included. - [ ] Example Operator (TensorOps) + generated metadata + generated AsciiDoc fragment. -- [ ] Demonstrated include:: and xref:: usage. +- [ ] Demonstrated `include::` and `xref:` usage. - [ ] Mermaid pipeline diagram present. - [ ] Gradle/Dokka/AsciiDoctorJ integration details with code. - [ ] Summary articulates benefits and risks. \ No newline at end of file diff --git a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc new file mode 100644 index 00000000..2b74c01c --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc @@ -0,0 +1,111 @@ +==== Java 25 Advantages for the JVM CPU Backend + +Java 25 (GA September 2025) delivers significant free performance improvements to the +SKaiNET JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup +features β€” all without requiring code changes. + +===== Compatibility + +The same code, same flags, and same runtime detection work across JDK 21–25: + +* Vector API remains incubator on JDK 25 (JEP 508) β€” identical `jdk.incubator.vector` package. +* Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22{plus}. +* Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. +* Build config (`jvmTarget = JVM++_++21`, `options.release.set(21)`) produces compatible bytecode. + +*No special treatment is needed for JDK ++>++= 21 but ++<++ 25.* + +Required flags remain: + +.... +--enable-preview --add-modules jdk.incubator.vector +.... + +[[jit--c2-improvements-mapped-to-skainet-ops]] +===== JIT / C2 improvements mapped to SKaiNET ops + +These are automatic β€” the JIT produces better native code for existing bytecode. 
+ +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|VPointer refactoring for vector loads/stores |https://bugs.openjdk.org/browse/JDK-8350748[JDK-8350748] |up to 14x |All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` +|SuperWord SIMD enhancement |https://bugs.openjdk.org/browse/JDK-8343685[JDK-8343685] |up to 33x |Same vectorized loops (elementwise, reductions, matmul inner loops) +|`Math.max` / `Math.min` intrinsified for `long` |JDK-8350485 |3–5x |Shape computation, tile clamping in blocked matmul +|=== + +Source files: + +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` + +===== Panama FFI improvements + +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|Faster `MemorySegment` allocation |https://bugs.openjdk.org/browse/JDK-8345687[JDK-8345687] |~2x |`MemorySegmentTensorData.kt` (`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` +|`MemorySegment::fill` optimized on AArch64 |https://bugs.openjdk.org/browse/JDK-8354674[JDK-8354674] |~2.5x |Tensor zeroing, blocked matmul result initialization +|=== + +Source files: + +* `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` +* `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` + +===== Object layout and GC + +* *Compact Object Headers* (JEP 519) β€” reduces object header from 12 to 8 bytes. +Meaningful for tensor metadata arrays with millions of small objects. +Opt-in: `-XX:{plus}UseCompactObjectHeaders` +* *Generational Shenandoah* (JEP 521) β€” lower GC pause times for allocation-heavy +workloads (tensor creation, KV cache churn). 
+Opt-in: `-XX:{plus}UseShenandoahGC -XX:ShenandoahGCMode=generational` + +===== Startup and warmup + +* *AOT profiling / caching* (JEP 515) β€” records JIT profile data from a training run +and replays it on subsequent launches. Reduces warmup by 15–25%. +Useful for CLI apps like kLLaMA where first-token latency matters. + +Usage: + +.... +# Training run (records profile) +java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" + +# Production run (replays profile) +java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" +.... + +===== Recommended JVM flags for Java 25 + +Required (same as JDK 21–24): + +.... +--enable-preview +--add-modules jdk.incubator.vector +.... + +Optional β€” enable for maximum benefit on JDK 25: + +.... +-XX:+UseCompactObjectHeaders +-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational +-XX:AOTCache=app.aot # after training run +.... + +===== Summary + +[cols=",,",options="header",] +|=== +|Feature |Benefit |Component +|VPointer refactoring (C2) |Up to 14x faster vector loads/stores |`JvmVectorKernels`, `JvmQuantizedVectorKernels` +|SuperWord SIMD (C2) |Up to 33x faster auto-vectorized loops |Same vector kernel files +|`Math.max/min` intrinsic |3–5x faster long comparisons |Shape computation, tile clamping +|Faster segment allocation |~2x allocation throughput |`MemorySegmentTensorDataFactory`, `PagedKvCache` +|`MemorySegment::fill` (AArch64) |~2.5x faster bulk zeroing |Tensor init, matmul result buffers +|Compact Object Headers |~30% smaller object headers |All tensor metadata +|Generational Shenandoah |Lower GC pauses |Allocation-heavy inference +|AOT profiling |15–25% faster warmup |CLI apps (kLLaMA) +|=== diff --git a/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc new file mode 100644 index 00000000..167aac22 --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc @@ -0,0 +1,110 @@ +==== JVM CPU Backend Performance Benchmarks (JMH) + +This 
page explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence for performance targets.
+
+===== What’s included
+
+* Elementwise: FP32 `add` on 1,000,000 elements
+* Reductions: FP32 `sum` and `mean` on 1,000,000 elements
+* Matmul: FP32 square `matmul` with sizes 256, 512, and 1024
+
+Benchmarks are implemented in module:
+
+* `:skainet-backends:benchmarks:jvm-cpu-jmh`
+
+Source files:
+
+* `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt`
+* `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt`
+* `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt`
+
+===== Prerequisites
+
+* JDK 21{plus} (JDK 22 toolchain configured by Gradle)
+* Gradle will pass required JVM flags:
+** `--enable-preview`
+** `--add-modules jdk.incubator.vector`
+
+For Java 25-specific performance advantages, see xref:java-25-cpu-backend.adoc[Java 25 CPU Backend].
+
+===== Feature flags
+
+You can toggle acceleration paths at runtime using system properties or environment variables:
+
+* Vector acceleration:
+** `-Dskainet.cpu.vector.enabled=true++|++false`
+** or `SKAINET++_++CPU++_++VECTOR++_++ENABLED=true++|++false`
+* BLAS via Panama (matmul heuristic for larger sizes):
+** `-Dskainet.cpu.blas.enabled=true++|++false`
+** or `SKAINET++_++CPU++_++BLAS++_++ENABLED=true++|++false`
+
+Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args.
+
+===== How to run all benchmarks
+
+From repository root:
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh
+....
+
+This will build and execute all JMH benchmarks with the default parameters defined in sources.
+
+===== Run specific benchmarks
+
+* Elementwise add (both vector on/off):
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \
+  -Pjmh.include=ElementwiseAdd1MBench
+....
+
+* Reductions (vector on/off):
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \
+  -Pjmh.include=Reductions1MBench
+....
+ +* Matmul, all sizes, with vector on and BLAS on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true +.... + +* Matmul at 512 only, comparing BLAS on/off with vector on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.size=512 \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true,false +.... + +Notes: + +* You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). +* JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. + +===== Recording environment details + +Include at minimum: + +* CPU model, cores/threads, base/boost clock +* RAM size and speed +* OS version +* JDK version and vendor +* Gradle version +* JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) +* SKaiNET flags used (vector, BLAS) + +===== Performance targets (to be validated on your hardware) + +* β‰₯ 4Γ— speedup on FP32 `matmul` 512Γ—512 vs baseline scalar +* β‰₯ 3Γ— speedup on FP32 `add` with 1M elements vs baseline scalar + +Use the above commands to produce β€œvector=false/blas=false” baselines vs β€œvector=true++[++/blas=true++]++” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. 
diff --git a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc new file mode 100644 index 00000000..102aa5ac --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc @@ -0,0 +1,143 @@ +[[skainet-core-technology-tensor--data-guide]] +== SKaiNET Core Technology: Tensor & Data Guide + +This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. + +[[1-fundamental-architecture-tensor-composition]] +=== 1. Fundamental Architecture: Tensor Composition + +Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a *compositional architecture*. A `Tensor++<++T, V++>++` is composed of two primary components: + +[arabic] +. *`TensorData++<++T, V++>++`*: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. +. *`TensorOps`*: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). + +This separation allows for high flexibility, such as switching execution backends without changing the data representation. + +[source,kotlin] +---- +interface Tensor { + val data: TensorData + val ops: TensorOps + val dtype: KClass + val shape: Shape +} +---- + +[[2-type-safe-tensor-creation-dsl]] +=== 2. Type-Safe Tensor Creation (DSL) + +SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). + +==== Creation with `ExecutionContext` + +Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. 
+ +[source,kotlin] +---- +// Basic creation +val zeros = ctx.zeros(Shape(2, 3), FP32::class) +val ones = ctx.ones(Shape(1, 10), Int32::class) +val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) +---- + +==== Expressive Tensor DSL + +For more complex initializations, use the `tensor` DSL: + +[source,kotlin] +---- +val myTensor = tensor(ctx, FP32::class) { + shape(2, 2) { + from(1.0f, 2.0f, 3.0f, 4.0f) + } +} + +val randomTensor = tensor(ctx, FP32::class) { + shape(10, 10) { + randn(mean = 0f, std = 1f) + } +} + +val customInit = tensor(ctx, Int32::class) { + shape(5, 5) { + init { indices -> indices[0] + indices[1] } + } +} +---- + +[[3-slicing-dsl-api]] +=== 3. Slicing DSL API + +SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. + +==== `sliceView` vs `sliceCopy` + +* *`sliceView`*: Creates a `TensorView`, which is a window into the original data (no data copying). +* *`sliceCopy`*: Creates a new `Tensor` with a copy of the sliced data. + +==== Slicing DSL Syntax + +The `SegmentBuilder` provides several ways to define slices for each dimension: + +* `range(start, end)`: A range of indices. +* `at(index)`: A single index (reduces rank). +* `all()`: All elements in that dimension (equivalent to `:` in NumPy). +* `step(start, end, step)`: Strided access. +* `{plus}all()`: Short-hand for `all()`. + +[source,kotlin] +---- +val source = ctx.ones(Shape(10, 20, 30), FP32::class) + +// Slicing: [0:5, 10, :] +val view = source.sliceView { + segment { range(0, 5) } // Dim 0 + segment { at(10) } // Dim 1 + segment { all() } // Dim 2 +} +---- + +[[4-core-operations-tensorops]] +=== 4. Core Operations (`TensorOps`) + +All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: + +* *Element-wise Ops*: `add`, `subtract`, `multiply`, `divide` (and scalar versions). +* *Linear Algebra*: `matmul`, `transpose`. 
+* *Neural Network Ops*: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. +* *Reductions*: `sum`, `mean`, `variance`. +* *Shape Ops*: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. + +==== Operator Overloading + +When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: + +[source,kotlin] +---- +val c = a + b // Calls ops.add(a, b) +val d = a * 10 // Calls ops.mulScalar(a, 10) +---- + +[[5-summary-table-skainet-vs-numpy]] +=== 5. Summary Table: SKaiNET vs NumPy + +[cols="<,<,<",options="header",] +|=== +|Feature |NumPy |SKaiNET +|*Primary Type* |`ndarray` |`Tensor++<++T, V++>++` +|*Creation* |`np.array(++[++1, 2, 3++]++)` |`tensor(ctx, FP32::class) ++{++ shape(3) ++{++ from(1f, 2f, 3f) } }` +|*Zeros* |`np.zeros((2, 2))` |`ctx.zeros(Shape(2, 2), FP32::class)` +|*Slicing* |`a++[++0:5, :++]++` |`a.sliceView ++{++ segment ++{++ range(0, 5) }; segment ++{++ all() } }` +|*Matmul* |`a @ b` or `np.matmul(a, b)` |`ctx.ops.matmul(a, b)` +|*Reshape* |`a.reshape(new++_++shape)` |`ctx.ops.reshape(a, Shape(new++_++shape))` +|=== + +[[6-best-practices-for-ai-integration]] +=== 6. Best Practices for AI Integration + +[arabic] +. *Context Awareness*: Always pass the `ExecutionContext` to functions that create or manipulate tensors. +. *Type Safety*: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. +. *Views over Copies*: Use `sliceView` whenever possible to minimize memory overhead and improve performance. +. *Backend Agnostic*: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend. 
diff --git a/docs/theory/index.adoc b/docs/modules/ROOT/pages/explanation/theory/index.adoc similarity index 80% rename from docs/theory/index.adoc rename to docs/modules/ROOT/pages/explanation/theory/index.adoc index e82bd082..80917802 100644 --- a/docs/theory/index.adoc +++ b/docs/modules/ROOT/pages/explanation/theory/index.adoc @@ -5,10 +5,6 @@ This section contains mathematical definitions and theoretical foundations for S [#operator-theory] == Operator Theory -=== Architecture - -include::composite-ops.adoc[leveloffset=+2] - === Linear Algebra Operations include::matmul.adoc[leveloffset=+2] @@ -36,5 +32,5 @@ include::matmul.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../examples/index.adoc[Usage Examples] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/examples/index.adoc[Usage Examples] +// Operator reference lands in a later commit of the Antora migration. \ No newline at end of file diff --git a/docs/theory/matmul.adoc b/docs/modules/ROOT/pages/explanation/theory/matmul.adoc similarity index 100% rename from docs/theory/matmul.adoc rename to docs/modules/ROOT/pages/explanation/theory/matmul.adoc diff --git a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc new file mode 100644 index 00000000..7ef1165c --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc @@ -0,0 +1,95 @@ +== Arduino C Code Generation + +SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. + +=== Overview + +The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. 
+ +==== Codegen Pipeline + +[mermaid] +---- +graph TD + A[Kotlin Model] --> B[Recording Pass] + B --> C[Execution Tape] + C --> D[Compute Graph] + D --> E[Graph Validation] + E --> F[Memory Layout Calculation] + F --> G[C Code Emission] + G --> H[Arduino Library Packaging] + H --> I[Generated .h/.c files] +---- + +=== Technical Deep Dive + +[[1-tape-based-tracing]] +==== 1. Tape-based Tracing + +Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. + +* Every operation (Dense, ReLU, etc.) is recorded onto an *Execution Tape*. +* This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. + +[[2-compute-graph-construction]] +==== 2. Compute Graph Construction + +The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. + +* Nodes represent operations (Ops). +* Edges represent data flow (Tensors). +* During this phase, the compiler performs *Shape Inference* to ensure every tensor has a fixed, known size. + +[[3-static-memory-management]] +==== 3. Static Memory Management + +Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a *Ping-Pong Buffer Strategy* to eliminate dynamic memory allocation (`malloc`/`free`) during inference. + +===== Ping-Pong Buffer Strategy + +The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. 
+ +[mermaid] +---- +sequenceDiagram + participant I as Input + participant B1 as Buffer A + participant B2 as Buffer B + participant O as Output + + I->>B1: Layer 1 (Input -> A) + B1->>B2: Layer 2 (A -> B) + B2->>B1: Layer 3 (B -> A) + B1->>O: Layer 4 (A -> Output) +---- + +* *Buffer Reuse*: Instead of allocating space for every layer's output, buffers are reused. +* *Direct Output Optimization*: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. + +[[4-code-generation-emission]] +==== 4. Code Generation (Emission) + +The `CCodeGenerator` emits C99-compatible code using templates. + +* *Weights & Biases*: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. +* *Kernel Implementation*: Operations like `Dense` (Linear) are implemented as optimized nested loops. +* *Header Generation*: Produces a clean API for the user: ++ +[source,c] +---- +int model_inference(const float* input, float* output); +---- + +[[5-validation]] +==== 5. Validation + +The generator performs post-generation validation: + +* *Static Allocation Check*: Ensures no dynamic allocation is present in the generated source. +* *Buffer Alternation Check*: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. + +=== Performance and Constraints + +* *Floating Point*: Currently optimized for `FP32`. +* *Supported Ops*: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. +* *Memory*: Total memory consumption is `TotalWeights {plus} 2 ++*++ MaxIntermediateTensor`. 
diff --git a/docs/modules/ROOT/pages/how-to/build.adoc b/docs/modules/ROOT/pages/how-to/build.adoc new file mode 100644 index 00000000..c3a6b6d6 --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/build.adoc @@ -0,0 +1,87 @@ +== Build Help + +=== Dokka API Documentation + +SKaiNET uses https://github.com/Kotlin/dokka[Dokka 2.1.0] to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. + +==== Generating docs locally + +*Single module:* + +[source,bash] +---- +./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml +---- + +Output: `skainet-lang/skainet-lang-core/build/dokka/html/` + +*Aggregated (all modules):* + +[source,bash] +---- +./gradlew dokkaGenerate +---- + +Output: `build/dokka/html/index.html` + +==== Convention plugin details + +The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: + +* *moduleName* from `project.name` +* *moduleVersion* from the `VERSION++_++NAME` Gradle property +* *Documented visibilities:* public only +* *Suppressed generated files:* KSP-generated code is excluded +* *Suppressed native source sets:* `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols +* *Source links* pointing to the GitHub repository + +==== Modules with Dokka enabled + +The plugin is applied to 21 library modules: + +[cols=",",options="header",] +|=== +|Group |Modules +|skainet-lang |`skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` +|skainet-compile |`skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, `skainet-compile-hlo`, `skainet-compile-c` +|skainet-backends |`skainet-backend-cpu` +|skainet-data |`skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, 
`skainet-data-media` +|skainet-io |`skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` +|Other |`skainet-pipeline`, `skainet-model-yolo` +|=== + +*Excluded:* `skainet-bom` (no source), `skainet-apps/++*++`, `skainet-test/++*++`, benchmarks, and `skainet-lang-ksp-processor` (internal). + +==== Root-level aggregation + +The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. + +==== KSP interaction + +`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: + +[source,kotlin] +---- +tasks.matching { it.name.startsWith("dokka") }.configureEach { + dependsOn("kspCommonMainKotlinMetadata") +} +---- + +This ensures KSP-generated sources are available before Dokka runs. + +==== GitHub Pages deployment + +The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow++_++dispatch`). It: + +[arabic] +. Checks out the repo +. Sets up JDK 25 +. Runs `./gradlew dokkaGenerate` +. Uploads the `build/dokka/html` directory as a Pages artifact +. Deploys to GitHub Pages using `actions/deploy-pages@v4` + +*Prerequisite:* The repository must have Pages configured to deploy from GitHub Actions (Settings ++>++ Pages ++>++ Source: "GitHub Actions"). + +==== Operator docs (unchanged) + +The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. 
diff --git a/docs/io-readers-guide.md b/docs/modules/ROOT/pages/how-to/io-readers.adoc similarity index 88% rename from docs/io-readers-guide.md rename to docs/modules/ROOT/pages/how-to/io-readers.adoc index d431a7c3..1f4b18da 100644 --- a/docs/io-readers-guide.md +++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc @@ -1,48 +1,54 @@ -# SKaiNET I/O Readers Guide +== SKaiNET I/O Readers Guide This guide demonstrates how to use SKaiNET's GGUF and ONNX readers in your Kotlin Multiplatform projects. -## Overview +=== Overview SKaiNET provides two main I/O modules for reading AI model formats: -- **skainet-io-gguf**: For reading GGUF (GPT-Generated Unified Format) files -- **skainet-io-onnx**: For reading ONNX (Open Neural Network Exchange) files + +* *skainet-io-gguf*: For reading GGUF (GPT-Generated Unified Format) files +* *skainet-io-onnx*: For reading ONNX (Open Neural Network Exchange) files Both modules are built on Kotlin Multiplatform and support JVM, Android, iOS, JS, WASM, and Native platforms. -## Dependencies +=== Dependencies Add the following dependencies to your `build.gradle.kts`: -### For GGUF Support +==== For GGUF Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-gguf:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") } -``` +---- -### For ONNX Support +==== For ONNX Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-onnx:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") implementation("pro.streem.pbandk:pbandk-runtime:0.16.0") } -``` +---- -## GGUF Reader Usage +=== GGUF Reader Usage -> **Recommended:** For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. -> The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting -> files over 100 GB without heap-loading the entire file. It also supports quantized types -> (Q4_K, Q8_0, etc.) via `StreamingGgufParametersLoader`. 
See the streaming examples below. +____ +*Recommended:* For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. +The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting +files over 100 GB without heap-loading the entire file. It also supports quantized types +(Q4++_++K, Q8++_++0, etc.) via `StreamingGgufParametersLoader`. See the streaming examples below. +____ -### Streaming GGUF Reading (Recommended) +==== Streaming GGUF Reading (Recommended) -```kotlin +[source,kotlin] +---- import sk.ainet.io.JvmRandomAccessSource import sk.ainet.io.gguf.StreamingGGUFReader @@ -60,16 +66,19 @@ fun readLargeModel(filePath: String) { println("Encoding: ${storage.encoding.name}, Physical: ${storage.physicalBytes} bytes") } } -``` +---- -### Legacy GGUF Reading +==== Legacy GGUF Reading -> **Note:** The legacy `GGUFReader` loads the entire file into memory and only supports -> F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. +____ +*Note:* The legacy `GGUFReader` loads the entire file into memory and only supports +F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. 
+____ -### Basic GGUF Reading +==== Basic GGUF Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import kotlinx.io.buffered @@ -114,11 +123,12 @@ suspend fun readGGUFModel(filePath: String) { } } } -``` +---- -### Working with Tensor Data +==== Working with Tensor Data -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader import sk.ainet.io.gguf.GGMLQuantizationType @@ -155,11 +165,12 @@ fun processTensorData(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading for Large Models +==== Lazy Loading for Large Models -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader fun readGGUFMetadataOnly(filePath: String) { @@ -184,13 +195,14 @@ fun readGGUFMetadataOnly(filePath: String) { } } } -``` +---- -## ONNX Reader Usage +=== ONNX Reader Usage -### Basic ONNX Reading +==== Basic ONNX Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import sk.ainet.io.onnx.OnnxLoader @@ -229,11 +241,12 @@ suspend fun readONNXModel(filePath: String) { println(" Outputs: ${graph.output.size}") } } -``` +---- -### Working with ONNX Graph Structure +==== Working with ONNX Graph Structure -```kotlin +[source,kotlin] +---- import onnx.ModelProto import onnx.NodeProto import onnx.TensorProto @@ -286,11 +299,12 @@ fun getAttributeValue(attr: onnx.AttributeProto): String { fun getTensorShapeString(tensor: TensorProto): String { return tensor.dims.joinToString("x") { it.toString() } } -``` +---- -### Custom ONNX Loader with Error Handling +==== Custom ONNX Loader with Error Handling -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import sk.ainet.io.onnx.OnnxLoader import sk.ainet.io.onnx.OnnxLoadedModel @@ -352,13 +366,14 @@ suspend fun safeLoadOnnx(filePath: String) { println("Failed to load ONNX model: ${error.message}") } } -``` +---- -## Platform-Specific Considerations +=== Platform-Specific Considerations -### JVM Platform +==== JVM Platform -```kotlin 
+[source,kotlin] +---- // JVM-specific file reading import java.io.File import java.nio.file.Path @@ -366,11 +381,12 @@ import java.nio.file.Path fun readFromFile(path: Path): Source { return path.toFile().inputStream().asSource().buffered() } -``` +---- -### Android Platform +==== Android Platform -```kotlin +[source,kotlin] +---- // Android-specific asset reading import android.content.Context import android.content.res.AssetManager @@ -378,11 +394,12 @@ import android.content.res.AssetManager fun readFromAssets(context: Context, fileName: String): Source { return context.assets.open(fileName).asSource().buffered() } -``` +---- -### iOS/Native Platform +==== iOS/Native Platform -```kotlin +[source,kotlin] +---- // Native platform file reading import kotlinx.io.files.Path import kotlinx.io.files.SystemFileSystem @@ -391,13 +408,14 @@ fun readFromNativePath(pathString: String): Source { val path = Path(pathString) return SystemFileSystem.source(path).buffered() } -``` +---- -## Performance Tips +=== Performance Tips -### Memory Management +==== Memory Management -```kotlin +[source,kotlin] +---- // For large models, consider streaming or chunked processing fun processLargeModel(reader: GGUFReader) { // Process tensors one at a time to manage memory @@ -411,11 +429,12 @@ fun processLargeModel(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading Strategy +==== Lazy Loading Strategy -```kotlin +[source,kotlin] +---- class ModelManager { private var reader: GGUFReader? = null private val tensorCache = mutableMapOf>() @@ -431,11 +450,12 @@ class ModelManager { } } } -``` +---- -## Error Handling Best Practices +=== Error Handling Best Practices -```kotlin +[source,kotlin] +---- sealed class ModelLoadResult { data class Success(val model: T) : ModelLoadResult() data class Error(val message: String, val cause: Throwable? 
= null) : ModelLoadResult() @@ -459,13 +479,14 @@ suspend fun loadModelSafely(filePath: String): ModelLoadResult { ModelLoadResult.Error("Failed to load model: ${e.message}", e) } } -``` +---- -## Integration Examples +=== Integration Examples -### Using with Coroutines +==== Using with Coroutines -```kotlin +[source,kotlin] +---- import kotlinx.coroutines.* class AsyncModelLoader { @@ -495,6 +516,6 @@ class AsyncModelLoader { } data class ProcessedTensor(val name: String, val size: Int) -``` +---- -This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. \ No newline at end of file +This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. diff --git a/docs/java-cli-app.md b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc similarity index 85% rename from docs/java-cli-app.md rename to docs/modules/ROOT/pages/how-to/java-cli-app.adoc index c2288a5d..a233942d 100644 --- a/docs/java-cli-app.md +++ b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc @@ -1,22 +1,23 @@ -# Building a Java CLI App with KLlama +== Building a Java CLI App with KLlama -This guide walks you through creating a standalone Java 21+ command-line application that loads a LLaMA model and generates text using the KLlama library. +This guide walks you through creating a standalone Java 21{plus} command-line application that loads a LLaMA model and generates text using the KLlama library. 
-## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** -- A GGUF model file (e.g., [TinyLlama-1.1B-Chat GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)) +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* +* A GGUF model file (e.g., https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF[TinyLlama-1.1B-Chat GGUF]) ---- +''''' -## Project Setup +=== Project Setup -### Maven +==== Maven Create a `pom.xml`: -```xml +[source,xml] +---- System.out.print(token)); System.out.println(); } -``` +---- ---- +''''' -## Async Generation +=== Async Generation Use `generateAsync` to run generation on a virtual thread and get a `CompletableFuture`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { @@ -262,20 +269,21 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { String result = future.join(); System.out.println(result); } -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [Java LLM Inference Guide](java-llm-inference.md) β€” BERT embeddings, agent/tool-calling, and more. -- [Java Getting Started](java-getting-started.md) β€” tensor operations, full Maven/Gradle setup. -- [KLlama Library](../skainet-apps/skainet-kllama/README.md) β€” custom backends and Kotlin embedding. +* link:java-llm-inference.md[Java LLM Inference Guide] β€” BERT embeddings, agent/tool-calling, and more. +* link:java-getting-started.md[Java Getting Started] β€” tensor operations, full Maven/Gradle setup. 
+* link:../skainet-apps/skainet-kllama/README.md[KLlama Library] β€” custom backends and Kotlin embedding. diff --git a/docs/java-llm-inference.md b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc similarity index 78% rename from docs/java-llm-inference.md rename to docs/modules/ROOT/pages/how-to/java-llm-inference.adoc index feb94244..567b9aa1 100644 --- a/docs/java-llm-inference.md +++ b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc @@ -1,15 +1,16 @@ -# Java LLM Inference Guide +== Java LLM Inference Guide This guide covers loading and running large language models (LLaMA, BERT) from Java using SKaiNET's blocking, streaming, and async APIs. -## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -47,19 +48,20 @@ This guide covers loading and running large language models (LLaMA, BERT) from J skainet-backend-cpu-jvm -``` +---- ---- +''''' -## LLaMA Inference +=== LLaMA Inference All LLaMA Java classes live in `sk.ainet.apps.kllama.java`. -### Loading a GGUF Model +==== Loading a GGUF Model The simplest way to get started is to load a GGUF file. `KLlamaJava.loadGGUF()` handles context creation, weight loading, quantization dispatch, and tokenizer setup behind the scenes. -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.java.GenerationConfig; @@ -73,50 +75,54 @@ public class LlamaExample { } } } -``` +---- `KLlamaSession` implements `AutoCloseable`, so `try-with-resources` properly releases the off-heap memory arenas when you are done. 
-### Loading SafeTensors (HuggingFace Format) +==== Loading SafeTensors (HuggingFace Format) If you have a HuggingFace model directory containing `model.safetensors`, `config.json`, and `tokenizer.json`: -```java +[source,java] +---- try (KLlamaSession session = KLlamaJava.loadSafeTensors(Path.of("./my-llama-model/"))) { String response = session.generate("Once upon a time"); System.out.println(response); } -``` +---- The directory must contain: -- `model.safetensors` -- the model weights -- `config.json` -- model architecture config (hidden size, layers, heads, etc.) -- `tokenizer.json` -- HuggingFace tokenizer definition ---- +* `model.safetensors` -- the model weights +* `config.json` -- model architecture config (hidden size, layers, heads, etc.) +* `tokenizer.json` -- HuggingFace tokenizer definition -## GenerationConfig +''''' + +=== GenerationConfig Control generation parameters with the builder pattern: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(256) // maximum tokens to generate (default: 256) .temperature(0.7f) // sampling temperature (default: 0.8) .build(); String response = session.generate("Explain quantum computing", config); -``` +---- Use `GenerationConfig.defaults()` for the default configuration (256 max tokens, 0.8 temperature). ---- +''''' -## Streaming Generation +=== Streaming Generation -Pass a `Consumer` to receive each token as it is generated. This is useful for displaying output in real time: +Pass a `Consumer++<++String++>++` to receive each token as it is generated. This is useful for displaying output in real time: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(512) .temperature(0.9f) @@ -129,17 +135,18 @@ String fullResponse = session.generate( ); System.out.println(); // newline after streaming -``` +---- -The `generate` overload with a `Consumer` still returns the complete generated text as its return value. 
+The `generate` overload with a `Consumer++<++String++>++` still returns the complete generated text as its return value. ---- +''''' -## Async Generation +=== Async Generation -`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture`: +`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture++<++String++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = session.generateAsync( @@ -150,27 +157,29 @@ CompletableFuture future = session.generateAsync( // Do other work while generation runs... String result = future.join(); // block when you need the result System.out.println(result); -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## BERT Encoding and Similarity +=== BERT Encoding and Similarity All BERT Java classes live in `sk.ainet.apps.bert.java`. 
-### Loading a BERT Model +==== Loading a BERT Model Load a BERT model from a HuggingFace directory containing `model.safetensors` and `vocab.txt`: -```java +[source,java] +---- import sk.ainet.apps.bert.java.KBertJava; import sk.ainet.apps.bert.java.KBertSession; import java.nio.file.Path; @@ -180,18 +189,20 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ float[] embedding = bert.encode("SKaiNET is a tensor framework"); System.out.println("Embedding dimension: " + embedding.length); } -``` +---- The directory must contain: -- `model.safetensors` -- BERT model weights -- `vocab.txt` -- WordPiece vocabulary -- `config.json` (optional) -- model config; defaults are used if absent -### Similarity Scoring +* `model.safetensors` -- BERT model weights +* `vocab.txt` -- WordPiece vocabulary +* `config.json` (optional) -- model config; defaults are used if absent + +==== Similarity Scoring Compute cosine similarity between two texts directly: -```java +[source,java] +---- try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/"))) { float score = bert.similarity( "The cat sat on the mat", @@ -206,21 +217,22 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ ); System.out.printf("Unrelated: %.4f%n", low); // e.g. 0.1247 } -``` +---- -The returned value is cosine similarity in the range [-1, 1]. +The returned value is cosine similarity in the range ++[++-1, 1++]++. ---- +''''' -## Agent Loop and Tool Calling +=== Agent Loop and Tool Calling All agent/tool classes live in `sk.ainet.apps.kllama.chat.java`. The `JavaAgentLoop` lets the LLM call tools in a loop until it produces a final answer. You define tools by implementing the `JavaTool` interface. 
-### Defining a Tool +==== Defining a Tool -```java +[source,java] +---- import sk.ainet.apps.kllama.chat.java.JavaTool; import sk.ainet.apps.kllama.chat.ToolDefinition; import java.util.Map; @@ -255,11 +267,12 @@ public class CalculatorTool implements JavaTool { return 0.0; } } -``` +---- -### Building and Using the Agent +==== Building and Using the Agent -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.chat.java.JavaAgentLoop; @@ -285,24 +298,26 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(Path.of("model.gguf"))) { // Reset conversation history (keeps system prompt) agent.reset(); } -``` +---- -### Streaming Agent Responses +==== Streaming Agent Responses -```java +[source,java] +---- String answer = agent.chat( "What is the square root of 144?", token -> System.out.print(token) ); -``` +---- ---- +''''' -## Resource Management +=== Resource Management Both `KLlamaSession` and `KBertSession` implement `AutoCloseable`. Always use `try-with-resources` to ensure off-heap memory arenas and other native resources are released promptly: -```java +[source,java] +---- // Single session try (KLlamaSession session = KLlamaJava.loadGGUF(path)) { session.generate("Hello"); @@ -315,23 +330,25 @@ try (KLlamaSession llama = KLlamaJava.loadGGUF(llamaPath); String text = llama.generate("Write a summary of quantum mechanics"); float[] embedding = bert.encode(text); } -``` +---- Failing to close sessions will leak off-heap memory allocated via `java.lang.foreign.Arena`. 
---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|----------------------------------------|---------------------------------------------| -| `sk.ainet.apps.kllama.java` | `KLlamaJava`, `KLlamaSession`, `GenerationConfig` | -| `sk.ainet.apps.bert.java` | `KBertJava`, `KBertSession` | -| `sk.ainet.apps.kllama.chat.java` | `JavaAgentLoop`, `JavaTool` | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.apps.kllama.java` |`KLlamaJava`, `KLlamaSession`, `GenerationConfig` +|`sk.ainet.apps.bert.java` |`KBertJava`, `KBertSession` +|`sk.ainet.apps.kllama.chat.java` |`JavaAgentLoop`, `JavaTool` +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [Model Training Guide](java-model-training.md) -- build and train neural networks from Java. +* link:java-getting-started.md[Java Getting Started] -- tensor operations, project setup, and dependency management. +* link:java-model-training.md[Model Training Guide] -- build and train neural networks from Java. diff --git a/docs/java-model-training.md b/docs/modules/ROOT/pages/how-to/java-model-training.adoc similarity index 80% rename from docs/java-model-training.md rename to docs/modules/ROOT/pages/how-to/java-model-training.adoc index 92e3e9cc..2abf7d17 100644 --- a/docs/java-model-training.md +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -1,15 +1,16 @@ -# Java Model Training Guide +== Java Model Training Guide This guide covers building neural networks, defining loss functions and optimizers, loading datasets, and running training loops -- all from plain Java. 
-## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -41,15 +42,16 @@ This guide covers building neural networks, defining loss functions and optimize skainet-data-simple-jvm -``` +---- ---- +''''' -## Building a Model with SequentialModelBuilder +=== Building a Model with SequentialModelBuilder `SequentialModelBuilder` provides a fluent API for stacking dense layers and activations. It lives in `sk.ainet.java`. -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.java.SequentialModelBuilder; import sk.ainet.lang.nn.Module; @@ -63,39 +65,43 @@ Module model = new SequentialModelBuilder(ctx) .relu() // ReLU activation .dense(10) // fully connected: 128 -> 10 (digit classes) .build(); -``` - -### Available Layers and Activations - -| Method | Description | -|-------------------------|------------------------------------------| -| `.input(size)` | Set the input dimension (must be first) | -| `.dense(outputSize)` | Fully connected (linear) layer | -| `.relu()` | ReLU activation: max(0, x) | -| `.sigmoid()` | Sigmoid activation | -| `.silu()` | SiLU / Swish activation: x * sigmoid(x) | -| `.gelu()` | GELU activation | -| `.softmax(dim)` | Softmax along a dimension (default: -1) | -| `.flatten(start, end)` | Flatten dimensions | +---- + +==== Available Layers and Activations + +[cols=",",options="header",] +|=== +|Method |Description +|`.input(size)` |Set the input dimension (must be first) +|`.dense(outputSize)` |Fully connected (linear) layer +|`.relu()` |ReLU activation: max(0, x) +|`.sigmoid()` |Sigmoid activation +|`.silu()` |SiLU / Swish activation: x ++*++ sigmoid(x) +|`.gelu()` |GELU 
activation +|`.softmax(dim)` |Softmax along a dimension (default: -1) +|`.flatten(start, end)` |Flatten dimensions +|=== Weights are initialized using Xavier initialization. The data type defaults to FP32; pass a `DType` to the constructor to change it: -```java +[source,java] +---- Module model = new SequentialModelBuilder(ctx, DType.fp16()) .input(784) .dense(256) .gelu() .dense(10) .build(); -``` +---- ---- +''''' -## Losses +=== Losses The `Losses` factory (in `sk.ainet.java`) creates loss function instances: -```java +[source,java] +---- import sk.ainet.java.Losses; import sk.ainet.lang.nn.loss.Loss; @@ -110,15 +116,16 @@ Loss hub = Losses.huber(1.0f); // Huber / Smooth L1 Loss hin = Losses.hinge(1.0f); // hinge loss Loss shin = Losses.squaredHinge(1.0f); // squared hinge Loss poi = Losses.poisson(); // Poisson NLL -``` +---- ---- +''''' -## Optimizers +=== Optimizers The `Optimizers` factory (in `sk.ainet.java`) creates optimizer instances: -```java +[source,java] +---- import sk.ainet.java.Optimizers; import sk.ainet.lang.nn.optim.Optimizer; @@ -136,15 +143,16 @@ Optimizer sgd = Optimizers.sgd(0.01, 0.9); // SGD with momentum and weight decay Optimizer sgdWd = Optimizers.sgd(0.01, 0.9, 0.0001); -``` +---- ---- +''''' -## TrainingLoop +=== TrainingLoop `TrainingLoop` ties together a model, loss function, optimizer, and execution context. Build it with the static builder: -```java +[source,java] +---- import sk.ainet.java.TrainingLoop; TrainingLoop loop = TrainingLoop.builder() @@ -153,22 +161,25 @@ TrainingLoop loop = TrainingLoop.builder() .optimizer(Optimizers.adam(0.001)) .context(ctx) .build(); -``` +---- -### Single Training Step +==== Single Training Step `step(x, y)` performs one forward pass, computes the loss, backpropagates, and updates weights. 
It returns the loss as a `float`: -```java +[source,java] +---- float loss = loop.step(inputBatch, targetBatch); System.out.printf("Step loss: %.4f%n", loss); -``` +---- -### Full Training with `.train()` +[[full-training-with-train]] +==== Full Training with `.train()` `train()` accepts a `Supplier` that produces an `Iterator` of `(input, target)` pairs for each epoch: -```java +[source,java] +---- import sk.ainet.java.TrainingResult; import kotlin.Pair; @@ -179,15 +190,17 @@ TrainingResult result = loop.train( System.out.printf("Trained %d epochs, final loss: %.4f%n", result.getEpochs(), result.getFinalLoss()); -``` +---- Each call to the supplier should return a fresh iterator over the training batches for that epoch. This allows reshuffling between epochs. -### Async Training with `.trainAsync()` +[[async-training-with-trainasync]] +==== Async Training with `.trainAsync()` -`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture`: +`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture++<++TrainingResult++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = loop.trainAsync( @@ -199,23 +212,25 @@ CompletableFuture future = loop.trainAsync( TrainingResult result = future.join(); System.out.printf("Final loss: %.4f%n", result.getFinalLoss()); -``` +---- You can also compose the future: -```java +[source,java] +---- loop.trainAsync(() -> batches.iterator(), 10) .thenAccept(r -> System.out.println("Done! Loss: " + r.getFinalLoss())) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Loading MNIST Data +=== Loading MNIST Data The MNIST dataset loader lives in `sk.ainet.data.mnist`. 
The `MNISTBlocking` class provides blocking (non-suspend) methods for Java: -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTBlocking; import sk.ainet.data.mnist.MNISTDataset; @@ -225,34 +240,37 @@ MNISTDataset test = MNISTBlocking.loadTest(); System.out.println("Training samples: " + train.getImages().size()); // 60000 System.out.println("Test samples: " + test.getImages().size()); // 10000 -``` +---- The first call downloads the dataset from the internet and caches it. Subsequent calls load from disk. -### Custom Cache Directory +==== Custom Cache Directory -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTLoaderConfig; MNISTLoaderConfig config = new MNISTLoaderConfig("/tmp/my-mnist-cache", true); MNISTDataset train = MNISTBlocking.loadTrain(config); -``` +---- -### Working with MNIST Data +==== Working with MNIST Data -Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte[]` of 784 pixels (28x28) and a `byte` label (0-9): +Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte++[]++` of 784 pixels (28x28) and a `byte` label (0-9): -```java +[source,java] +---- var firstImage = train.getImages().get(0); byte label = firstImage.getLabel(); // e.g. 
5 byte[] pixels = firstImage.getImage(); // 784 bytes, 0-255 -``` +---- -### Creating Tensor Batches +==== Creating Tensor Batches To feed MNIST data into the training loop, convert images to tensors: -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.lang.types.DType; import kotlin.Pair; @@ -287,15 +305,16 @@ for (int i = 0; i < images.size(); i += batchSize) { var y = SKaiNET.tensor(ctx, new int[]{actual}, DType.fp32(), yData); batches.add(new Pair<>(x, y)); } -``` +---- ---- +''''' -## Complete MNIST Training Example +=== Complete MNIST Training Example Putting it all together: -```java +[source,java] +---- package com.example; import sk.ainet.java.*; @@ -382,23 +401,25 @@ public class MnistTraining { return batches; } } -``` +---- Run with: -```bash +[source,bash] +---- java --enable-preview --add-modules jdk.incubator.vector \ -cp target/classes:target/dependency/* \ com.example.MnistTraining -``` +---- ---- +''''' -## Async Training Example +=== Async Training Example For non-blocking training, use `trainAsync()` and handle the result with `CompletableFuture`: -```java +[source,java] +---- var future = loop.trainAsync(() -> (Iterator) batches.iterator(), 10); // Monitor progress or do other work @@ -408,23 +429,25 @@ future.thenAccept(result -> { System.out.printf("Finished: %d epochs, loss %.4f%n", result.getEpochs(), result.getFinalLoss()); }).join(); -``` +---- ---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|-----------------------|------------------------------------------------------| -| `sk.ainet.java` | `SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` | -| `sk.ainet.data.mnist` | `MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` | -| `sk.ainet.lang.types` | `DType` | -| `sk.ainet.lang.nn.loss` | `Loss` (interface returned by `Losses` factory) | -| `sk.ainet.lang.nn.optim` | `Optimizer` (interface returned by 
`Optimizers` factory) | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.java` |`SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` +|`sk.ainet.data.mnist` |`MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` +|`sk.ainet.lang.types` |`DType` +|`sk.ainet.lang.nn.loss` |`Loss` (interface returned by `Losses` factory) +|`sk.ainet.lang.nn.optim` |`Optimizer` (interface returned by `Optimizers` factory) +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, and build agents. +* link:java-getting-started.md[Java Getting Started] -- tensor operations, project setup, and dependency management. +* link:java-llm-inference.md[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, and build agents. diff --git a/docs/modules/ROOT/pages/index.adoc b/docs/modules/ROOT/pages/index.adoc new file mode 100644 index 00000000..fd1fda7e --- /dev/null +++ b/docs/modules/ROOT/pages/index.adoc @@ -0,0 +1,34 @@ += SKaiNET +:description: Kotlin Multiplatform tensor engine with a graph IR, pluggable backends, and StableHLO export. + +SKaiNET is a Kotlin Multiplatform tensor / compile / graph engine. +It provides a tensor DSL, execution contexts, a graph IR, model +loaders (GGUF, SafeTensors, ONNX), quantization primitives +(Q4_K, Q8_0, ternary, TurboQuant), a StableHLO emitter for cross- +platform compile targets, and a pluggable backend API that CPU, +GPU, and NPU backends can implement independently. + +This documentation site is organized following the +https://diataxis.fr/[DiΓ‘taxis / Divio framework]: + +Tutorials:: Learning-oriented. Start here if you are new to SKaiNET. +How-to guides:: Task-oriented. Recipes for solving specific problems. +Reference:: Information-oriented. 
Looking up APIs and op coverage. +Explanation:: Understanding-oriented. Background on design decisions. + +[NOTE] +==== +LLM-specific runtimes (Llama, Gemma, Qwen, BERT) live in the +sibling https://github.com/SKaiNET-developers/SKaiNET-transformers[SKaiNET-transformers] +repository and its own documentation site. This site covers the +engine layer only. +==== + +== Quick links + +* link:../api/index.html[API reference (Dokka)] (bundled at publish time) + +// The Tutorials / How-to / Reference / Explanation pages plus the +// operator coverage xref land in follow-up commits (#2 and #3 of +// the Antora migration). This page ships the landing copy first so +// the scaffold build succeeds with a real start_page. diff --git a/docs/modules/ROOT/pages/reference/api.adoc b/docs/modules/ROOT/pages/reference/api.adoc new file mode 100644 index 00000000..ba400d1f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/api.adoc @@ -0,0 +1,19 @@ += API Reference +:description: Kotlin API reference generated by Dokka. + +The full Kotlin API reference for every SKaiNET module is +generated by https://kotlinlang.org/docs/dokka-introduction.html[Dokka] +and published as a sibling path of this documentation site. + +link:../api/index.html[Open the Dokka API reference, window=_blank] + +[NOTE] +==== +The Dokka output is bundled into the published site by a +`bundleDokkaIntoSite` Gradle task that runs **after** Antora +writes the site. When you preview the site locally via +`docker run ... antora ... docs/antora-playbook.yml`, the +`/api/` path does not yet exist β€” run +`./gradlew bundleDokkaIntoSite` to populate it before clicking +through. 
+==== diff --git a/docs/modules/ROOT/pages/reference/architecture.adoc b/docs/modules/ROOT/pages/reference/architecture.adoc new file mode 100644 index 00000000..d350b26f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/architecture.adoc @@ -0,0 +1,11 @@ += Architecture +:description: How SKaiNET's compile and execution layers are organized. + +SKaiNET uses a hybrid backend strategy that separates development +iteration from production deployment. + +image::SKaiNET-compiler.svg[Architecture diagram of the SKaiNET compiler pipeline] + +// The original ARCHITECTURE.md at the repo root was a 4-line stub +// pointing at the compiler diagram. If you are looking for a +// deeper architecture write-up, contribute it as a PR to this page. diff --git a/docs/modules/ROOT/pages/reference/operators/generated/index.adoc b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc new file mode 100644 index 00000000..e64fe818 --- /dev/null +++ b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc @@ -0,0 +1,14 @@ += AI-NET Operators Reference + +Generated from version `1.0.0` on 2026-04-13 + +== Operators by Modality + +=== Core + +* xref:reference/operators/generated/voidtensorops.adoc[VoidTensorOps] + +=== Composite + +* xref:reference/operators/generated/similarity.adoc[Similarity] + diff --git a/docs/modules/operators/_generated_/similarity.adoc b/docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc similarity index 100% rename from docs/modules/operators/_generated_/similarity.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc diff --git a/docs/modules/operators/_generated_/voidtensorops.adoc b/docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc similarity index 100% rename from docs/modules/operators/_generated_/voidtensorops.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc diff --git 
a/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc new file mode 100644 index 00000000..6ee3957d --- /dev/null +++ b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc @@ -0,0 +1,19 @@ += Operator Coverage Matrix +:description: Cross-backend status for every operator function in SKaiNET. + +Generated from `operators.json` version `1.0.0` on 2026-04-13. + +Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. A missing entry means the backend makes no claim about the function — treat it as "unknown", not "not supported". + +[cols="2,1,1,1,1", options="header"] +|=== +| Operator.function | Metal | apple | cpu | wasm + +| `VoidTensorOps.matmul` | 🚧 | — | — | — +| `VoidTensorOps.transpose` | 🚧 | — | — | — +| `Similarity.cosineDistance` | — | ✅ | ✅ | ✅ + +| *Done* | *0 / 3* | *1 / 3* | *1 / 3* | *1 / 3* +|=== + +Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference]. diff --git a/docs/graph-dsl.md b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc similarity index 77% rename from docs/graph-dsl.md rename to docs/modules/ROOT/pages/tutorials/graph-dsl.adoc index 8c65ac17..3112dfc5 100644 --- a/docs/graph-dsl.md +++ b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc @@ -1,12 +1,13 @@ -# SKaiNET Graph DSL +== SKaiNET Graph DSL The SKaiNET Graph DSL provides a way to define complex directed acyclic graphs (DAGs) for machine learning models. Unlike the sequential `nn` DSL, the `dag` DSL allows for arbitrary wiring of nodes, multi-output graphs, and reusable modules.
-## Basic Usage +=== Basic Usage To define a graph, use the `dag` block: -```kotlin +[source,kotlin] +---- val program = dag { val x = input("input", TensorSpec("input", listOf(1, 3, 224, 224), "FP32")) @@ -18,33 +19,35 @@ val program = dag { output(activated) } -``` +---- -## Key Concepts +=== Key Concepts -### Inputs, Parameters, and Constants +==== Inputs, Parameters, and Constants -- `input(name, spec)`: Defines an input node for the graph. -- `parameter(name) { ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. -- `constant(name) { ... }`: Defines a constant node (e.g., fixed biases or weights). +* `input++<++T++>++(name, spec)`: Defines an input node for the graph. +* `parameter++<++T, V++>++(name) ++{++ ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. +* `constant++<++T, V++>++(name) ++{++ ... }`: Defines a constant node (e.g., fixed biases or weights). -### Operations +==== Operations Standard operations like `conv2d`, `relu`, `matmul`, `add`, etc., are available as extension functions within the `DagBuilder` (operations are in sync with TensorOps and implemented extention method via KSP). -### Outputs +==== Outputs A graph can have one or more outputs, defined using the `output()` function. 
-```kotlin +[source,kotlin] +---- output(branch1, branch2) -``` +---- -## Reusable Modules +=== Reusable Modules You can define reusable graph components using `dagModule`: -```kotlin +[source,kotlin] +---- val residualBlock = dagModule { inputs -> val x = inputs[0] val conv1 = conv2d(x, w1, b1, padding = 1 to 1) @@ -59,25 +62,27 @@ val program = dag { val out = module(residualBlock, listOf(x)) output(out[0]) } -``` +---- -## Compiling and Validating +=== Compiling and Validating Once a `GraphProgram` is built, it can be converted to a `ComputeGraph` for execution or compilation: -```kotlin +[source,kotlin] +---- val graph = program.toComputeGraph() val validation = graph.validate() if (validation is ValidationResult.Valid) { // proceed to execution or compilation } -``` +---- -## YOLO-style Example +=== YOLO-style Example The Graph DSL is particularly useful for complex architectures like YOLO heads: -```kotlin +[source,kotlin] +---- val program = dag { val input = input("input", TensorSpec("input", listOf(1, 3, 640, 640), "FP32")) @@ -90,4 +95,4 @@ val program = dag { output(c2, head) // Multi-scale outputs } -``` +---- diff --git a/docs/hlo-getting-started.md b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc similarity index 59% rename from docs/hlo-getting-started.md rename to docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc index c0a116a4..d7d47a92 100644 --- a/docs/hlo-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc @@ -1,34 +1,35 @@ -# Getting Started with HLO in SKaiNET +== Getting Started with HLO in SKaiNET -## What is HLO? +=== What is HLO? -HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on [StableHLO](https://github.com/openxla/stablehlo) - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. 
+HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on https://github.com/openxla/stablehlo[StableHLO] - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. -## Why MLIR/XLA Instead of Direct Backends? +=== Why MLIR/XLA Instead of Direct Backends? SKaiNET uses the MLIR/XLA compilation approach rather than implementing separate backends for each hardware target. This design choice provides several key advantages: -**Single Implementation Path**: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. +*Single Implementation Path*: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. -**Automatic Optimization**: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. +*Automatic Optimization*: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. -**Future-Proof**: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. +*Future-Proof*: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. -**Ecosystem Integration**: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. 
+*Ecosystem Integration*: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. -### Key Benefits +==== Key Benefits -- **Portability**: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) -- **Optimization**: Leverage XLA's advanced compiler optimizations and operator fusion -- **Interoperability**: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems -- **Performance**: Hardware-specific optimizations without manual kernel development -- **No Backend Lock-in**: Single compilation target supports all hardware through XLA +* *Portability*: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) +* *Optimization*: Leverage XLA's advanced compiler optimizations and operator fusion +* *Interoperability*: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems +* *Performance*: Hardware-specific optimizations without manual kernel development +* *No Backend Lock-in*: Single compilation target supports all hardware through XLA -## Architecture Overview +=== Architecture Overview SKaiNET's HLO compilation pipeline transforms high-level Kotlin DSL operations into hardware-optimized executable code through the MLIR/XLA ecosystem: -```mermaid +[mermaid] +---- graph TD A[Kotlin DSL] --> B[Compute Graph] B --> C[HLO Converter] @@ -58,11 +59,12 @@ graph TD style A fill:#e1f5fe style D fill:#f3e5f5 style F fill:#e8f5e8 -``` +---- -### Data Flow Architecture +==== Data Flow Architecture -```mermaid +[mermaid] +---- flowchart LR subgraph "Input Layer" DSL[Kotlin DSL Code] @@ -92,48 +94,53 @@ flowchart LR style DSL fill:#bbdefb style Conv fill:#c8e6c9 style MLIR fill:#ffcdd2 -``` +---- -## Building Blocks +=== Building Blocks -### 1. HLO Converters +[[1-hlo-converters]] +==== 1. 
HLO Converters Converters transform SKaiNET operations into StableHLO operations: -- **MathOperationsConverter**: Basic arithmetic operations -- **LinalgOperationsConverter**: Linear algebra operations -- **ActivationOperationsConverter**: Neural network activations -- **NeuralNetOperationsConverter**: High-level NN operations -- **ConstantOperationsConverter**: Constant value operations +* *MathOperationsConverter*: Basic arithmetic operations +* *LinalgOperationsConverter*: Linear algebra operations +* *ActivationOperationsConverter*: Neural network activations +* *NeuralNetOperationsConverter*: High-level NN operations +* *ConstantOperationsConverter*: Constant value operations -### 2. Type System +[[2-type-system]] +==== 2. Type System HLO uses a strict type system for tensors: -```kotlin +[source,kotlin] +---- // SKaiNET tensor type Tensor // Batch, Channel, Height, Width // Converts to HLO type tensor<1x3x224x224xf32> // StableHLO representation -``` +---- -### 3. Optimization Framework +[[3-optimization-framework]] +==== 3. Optimization Framework The optimization pipeline includes: -- **Shape inference and propagation** -- **Constant folding and dead code elimination** -- **Operation fusion for performance** -- **Memory layout optimization** +* *Shape inference and propagation* +* *Constant folding and dead code elimination* +* *Operation fusion for performance* +* *Memory layout optimization* -## Practical Example: RGB to Grayscale Conversion +=== Practical Example: RGB to Grayscale Conversion -Let's walk through converting a color image tensor `Tensor` to grayscale using matrix multiplication. +Let's walk through converting a color image tensor `Tensor++<++B,C,H,W++>++` to grayscale using matrix multiplication. 
-### Step 1: Define the Operation in Kotlin DSL +==== Step 1: Define the Operation in Kotlin DSL -```kotlin +[source,kotlin] +---- // From: skainet-lang/skainet-lang-models/src/commonMain/kotlin/sk/ainet/lang/model/compute/Rgb2GrayScaleMultiply.kt fun Tensor.rgb2GrayScaleMatMul(): Tensor { // RGB to grayscale weights: [0.299, 0.587, 0.114] @@ -151,13 +158,14 @@ fun Tensor.rgb2GrayScaleMatMul(): Tensor { // Reshape back to [B,1,H,W] return gray.transpose(intArrayOf(0, 3, 1, 2)) } -``` +---- -### Step 2: HLO Conversion Process +==== Step 2: HLO Conversion Process The conversion pipeline transforms this operation: -```mermaid +[mermaid] +---- sequenceDiagram participant DSL as Kotlin DSL participant DAG as Compute Graph @@ -173,13 +181,14 @@ sequenceDiagram Opt->>HLO: Optimized IR Note over Conv,HLO: Type inference:
tensor β†’ tensor -``` +---- -### Step 3: Generated StableHLO IR +==== Step 3: Generated StableHLO IR The converter produces MLIR code like this: -```mlir +[source,mlir] +---- func.func @rgb2grayscale(%input: tensor) -> tensor { // Define grayscale conversion weights %weights = stablehlo.constant dense<[[0.299], [0.587], [0.114]]> : tensor<3x1xf32> @@ -199,28 +208,30 @@ func.func @rgb2grayscale(%input: tensor) -> tensor { return %result : tensor } -``` +---- -## Hardware Target Compilation via XLA +=== Hardware Target Compilation via XLA SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware platforms without requiring separate backend implementations. The StableHLO IR serves as a portable intermediate representation that XLA can compile to optimized code for various targets. -### Supported Hardware Targets +==== Supported Hardware Targets -- **CPU**: x86_64, ARM64 (via XLA CPU backend) -- **GPU**: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) -- **TPU**: Google TPUs (via XLA TPU backend) -- **Mobile**: iOS Metal, Android GPU (via XLA mobile backends) +* *CPU*: x86++_++64, ARM64 (via XLA CPU backend) +* *GPU*: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) +* *TPU*: Google TPUs (via XLA TPU backend) +* *Mobile*: iOS Metal, Android GPU (via XLA mobile backends) -### Prerequisites for GPU Compilation +==== Prerequisites for GPU Compilation -1. **XLA with GPU support**: [Installation guide](https://www.tensorflow.org/xla/tutorials/compile) -2. **NVIDIA CUDA Toolkit** (for NVIDIA GPUs): [Download here](https://developer.nvidia.com/cuda-downloads) -3. **ROCm** (for AMD GPUs): [Installation guide](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) +[arabic] +. *XLA with GPU support*: https://www.tensorflow.org/xla/tutorials/compile[Installation guide] +. *NVIDIA CUDA Toolkit* (for NVIDIA GPUs): https://developer.nvidia.com/cuda-downloads[Download here] +. 
*ROCm* (for AMD GPUs): https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html[Installation guide] -### Step 1: Generate StableHLO IR +==== Step 1: Generate StableHLO IR -```bash +[source,bash] +---- # Build SKaiNET HLO compiler ./gradlew :skainet-compile:skainet-compile-hlo:build @@ -228,11 +239,12 @@ SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware plat ./gradlew :skainet-compile:skainet-compile-hlo:generateHlo \ -Pmodel=rgb2grayscale \ -Poutput=rgb2grayscale.mlir -``` +---- -### Step 2: Compile with XLA for Target Hardware +==== Step 2: Compile with XLA for Target Hardware -```bash +[source,bash] +---- # Compile to GPU executable (NVIDIA CUDA) xla_compile \ --input_format=mlir \ @@ -257,24 +269,26 @@ xla_compile \ --platform=tpu \ --input_file=rgb2grayscale.mlir \ --output_file=rgb2grayscale_tpu.so -``` +---- -### Step 3: Runtime Execution +==== Step 3: Runtime Execution -```bash +[source,bash] +---- # Execute on target hardware using XLA runtime xla_run \ --executable=rgb2grayscale_cuda.so \ --input=image.jpg \ --output=gray.jpg \ --device=gpu:0 -``` +---- -### Jetson and Edge Device Deployment +==== Jetson and Edge Device Deployment For NVIDIA Jetson and other edge devices, the same MLIR β†’ XLA compilation approach applies: -```bash +[source,bash] +---- # Cross-compile for ARM64 with CUDA support xla_compile \ --input_format=mlir \ @@ -292,15 +306,16 @@ scp rgb2grayscale_jetson.so jetson@192.168.1.100:~/models/ ssh jetson@192.168.1.100 cd ~/models xla_run --executable=rgb2grayscale_jetson.so --device=gpu:0 -``` +---- -## Advanced Topics +=== Advanced Topics -### Custom HLO Operations +==== Custom HLO Operations Extend SKaiNET with custom operations: -```kotlin +[source,kotlin] +---- // Define custom operation @HloOperation("custom.rgb_enhance") class RgbEnhanceOp : HloConverter { @@ -311,34 +326,36 @@ class RgbEnhanceOp : HloConverter { """ } } -``` +---- -### Debugging HLO +==== Debugging HLO Use SKaiNET's 
built-in debugging tools: -```kotlin +[source,kotlin] +---- // Enable HLO debugging val optimizer = StableHloOptimizer(debugMode = true) val optimizedHlo = optimizer.optimize(hloModule) // Visualize computation graph optimizer.dumpGraphviz("rgb2gray.dot") -``` +---- -## Resources and References +=== Resources and References -- [StableHLO Specification](https://github.com/openxla/stablehlo/blob/main/docs/spec.md) -- [MLIR Documentation](https://mlir.llvm.org/docs/) -- [XLA Compilation Guide](https://www.tensorflow.org/xla) -- [NVIDIA Jetson Documentation](https://docs.nvidia.com/jetson/) -- [SKaiNET HLO Examples](./examples/hlo/) +* https://github.com/openxla/stablehlo/blob/main/docs/spec.md[StableHLO Specification] +* https://mlir.llvm.org/docs/[MLIR Documentation] +* https://www.tensorflow.org/xla[XLA Compilation Guide] +* https://docs.nvidia.com/jetson/[NVIDIA Jetson Documentation] +* link:./examples/hlo/[SKaiNET HLO Examples] -## Next Steps +=== Next Steps -1. **Explore Examples**: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` -2. **Run Tests**: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` -3. **Contribute**: Add new HLO converters for missing operations -4. **Optimize**: Profile and optimize your models using HLO tools +[arabic] +. *Explore Examples*: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` +. *Run Tests*: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` +. *Contribute*: Add new HLO converters for missing operations +. *Optimize*: Profile and optimize your models using HLO tools -For more detailed information, see the [HLO Optimization Guide](./OPTIMIZATION.md) and [API Documentation](https://docs.skainet.sk/hlo/). \ No newline at end of file +For more detailed information, see the link:./OPTIMIZATION.md[HLO Optimization Guide] and https://docs.skainet.sk/hlo/[API Documentation]. 
diff --git a/docs/java-getting-started.md b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc similarity index 78% rename from docs/java-getting-started.md rename to docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index e64be280..003a6d46 100644 --- a/docs/java-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -1,31 +1,33 @@ -# Java Getting Started Guide +== Java Getting Started Guide This guide gets you from zero to running tensor operations with SKaiNET in under 5 minutes. SKaiNET is a Kotlin Multiplatform AI framework, but every JVM-facing API is designed for idiomatic Java usage -- no Kotlin knowledge required. -## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* -## JVM Flags +=== JVM Flags SKaiNET uses the Java Vector API for SIMD-accelerated tensor operations. You must pass two flags every time you run your application: -``` +.... --enable-preview --add-modules jdk.incubator.vector -``` +.... -For Maven Surefire / exec-maven-plugin, add them to ``. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. +For Maven Surefire / exec-maven-plugin, add them to `++<++jvmArgs++>++`. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. ---- +''''' -## Maven Setup +=== Maven Setup -### 1. Import the BOM +[[1-import-the-bom]] +==== 1. Import the BOM -The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. Add it to your `` section: +The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. 
Add it to your `++<++dependencyManagement++>++` section: -```xml +[source,xml] +---- 0.13.0 @@ -76,13 +78,15 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep -``` +---- -### 2. Add More Modules as Needed +[[2-add-more-modules-as-needed]] +==== 2. Add More Modules as Needed Because the BOM is imported, you can add any module without repeating the version: -```xml +[source,xml] +---- sk.ainet @@ -106,13 +110,14 @@ Because the BOM is imported, you can add any module without repeating the versio sk.ainet skainet-kllama-agent-jvm -``` +---- ---- +''''' -## Gradle Kotlin DSL Setup +=== Gradle Kotlin DSL Setup -```kotlin +[source,kotlin] +---- plugins { java application @@ -144,15 +149,16 @@ application { tasks.withType { options.compilerArgs.addAll(listOf("--enable-preview")) } -``` +---- ---- +''''' -## Hello Tensor +=== Hello Tensor Create `src/main/java/com/example/HelloTensor.java`: -```java +[source,java] +---- package com.example; import sk.ainet.java.SKaiNET; @@ -186,39 +192,43 @@ public class HelloTensor { System.out.println("after relu: " + d); } } -``` +---- Run it: -```bash +[source,bash] +---- # Maven mvn compile exec:java # Gradle ./gradlew run -``` +---- ---- +''''' -## Key Entry Points +=== Key Entry Points All Java-facing classes live in the `sk.ainet.java` package: -| Class | Purpose | -|-------------------|--------------------------------------------------------| -| `SKaiNET` | Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` | -| `TensorJavaOps` | Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... | -| `Losses` | Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... | -| `Optimizers` | Optimizer factory -- `adam()`, `adamw()`, `sgd()` | -| `DType` | Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... 
| +[cols=",",options="header",] +|=== +|Class |Purpose +|`SKaiNET` |Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` +|`TensorJavaOps` |Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... +|`Losses` |Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... +|`Optimizers` |Optimizer factory -- `adam()`, `adamw()`, `sgd()` +|`DType` |Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... +|=== ---- +''''' -## Data Types +=== Data Types Access data types through static methods on `DType` (from `sk.ainet.lang.types`): -```java +[source,java] +---- import sk.ainet.lang.types.DType; DType f32 = DType.fp32(); // 32-bit float (default) @@ -229,15 +239,16 @@ DType i8 = DType.int8(); // 8-bit integer DType i32 = DType.int32(); // 32-bit integer DType i64 = DType.int64(); // 64-bit integer DType u8 = DType.uint8(); // unsigned 8-bit -``` +---- -You can also use the constant fields if you prefer: `DType.FP32_TYPE`, `DType.INT32_TYPE`, etc. +You can also use the constant fields if you prefer: `DType.FP32++_++TYPE`, `DType.INT32++_++TYPE`, etc. ---- +''''' -## Common Tensor Operations +=== Common Tensor Operations -```java +[source,java] +---- var ctx = SKaiNET.context(); // Creation @@ -277,11 +288,11 @@ var flat = TensorJavaOps.flatten(a); var resh = TensorJavaOps.reshape(a, new int[]{1, -1}); var sq = TensorJavaOps.squeeze(a, 0); var usq = TensorJavaOps.unsqueeze(a, 0); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. -- [Model Training Guide](java-model-training.md) -- build sequential models, train on MNIST, and run async training loops. +* link:java-llm-inference.md[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. 
+* link:java-model-training.md[Model Training Guide] -- build sequential models, train on MNIST, and run async training loops. diff --git a/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc new file mode 100644 index 00000000..153d32ef --- /dev/null +++ b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc @@ -0,0 +1,26 @@ += KLlama Getting Started + +KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. + +____ +*Early Stage Development*: The project is in active development. We appreciate your feedback and bug reports! +____ + +== Choose Your Path + +[cols=",",options="header",] +|=== +|Goal |Guide +|*Run models from the command line* |link:../skainet-apps/skainet-kllama-cli/README.md[KLlama CLI] +|*Embed in a Kotlin application* |link:../skainet-apps/skainet-kllama/README.md[KLlama Library] +|*Embed in a Java application* |link:java-llm-inference.md[Java LLM Inference Guide] +|*Build a standalone Java CLI app* |link:java-cli-app.md[Java CLI App Guide] +|*Java project setup (Maven / Gradle)* |xref:java-getting-started.adoc[Java Getting Started] +|=== + +== Quick Links + +* link:++../skainet-apps/skainet-kllama/README.md#supported-formats--quantization++[Supported formats & quantization] +* link:../skainet-apps/skainet-kllama/README.md#custom-backend-integration[Custom backend integration] +* link:java-llm-inference.md#agent-loop-and-tool-calling[Agent & tool calling] +* link:java-llm-inference.md#bert-encoding-and-similarity[BERT embeddings & similarity] diff --git a/docs/modules/operators/_generated_/index.adoc b/docs/modules/operators/_generated_/index.adoc deleted file mode 100644 index e172df10..00000000 --- a/docs/modules/operators/_generated_/index.adoc +++ /dev/null @@ -1,14 +0,0 @@ -= AI-NET Operators Reference - -Generated from
version `1.0.0` on 2026-03-03 - -== Operators by Modality - -=== Core - -* xref:voidtensorops.adoc[VoidTensorOps] - -=== Composite - -* xref:similarity.adoc[Similarity] - diff --git a/docs/nav.adoc b/docs/nav.adoc deleted file mode 100644 index f23df7ae..00000000 --- a/docs/nav.adoc +++ /dev/null @@ -1,50 +0,0 @@ -= SKaiNET Documentation Navigation - -[#main-nav] -== Main Navigation - -* xref:theory/index.adoc[Mathematical Theory] -** xref:theory/matmul.adoc[Matrix Multiplication] -* xref:examples/index.adoc[Usage Examples] -** xref:examples/matmul-examples.adoc[Matrix Multiplication Examples] -* xref:modules/operators/_generated_/index.adoc[Generated API Reference] - -[#quick-reference] -== Quick Reference - -=== Core Operations -* xref:theory/matmul.adoc#matmul-definition[Matrix Multiplication Theory] -* xref:examples/matmul-examples.adoc#basic-usage[Basic Matrix Multiplication] -* xref:examples/matmul-examples.adoc#neural-network[Neural Network Applications] - -=== Documentation Structure -* `docs/theory/` - Mathematical definitions and theoretical foundations -* `docs/examples/` - Practical usage examples and code samples -* `docs/modules/operators/_generated_/` - Auto-generated API reference - -[#toc-template] -== Table of Contents Template - -The following template can be used for generating table of contents in documentation pages: - ----- -[discrete] -== Table of Contents - -* <> -** <> -* <> ----- - -[#cross-reference-patterns] -== Cross-Reference Patterns - -=== Internal Links -* Theory to Examples: `xref:../examples/matmul-examples.adoc#basic-usage[Matrix Multiplication Examples]` -* Examples to Theory: `xref:../theory/matmul.adoc#matmul-definition[Mathematical Definition]` -* Generated to Human: `xref:../../theory/index.adoc[Theory Reference]` - -=== Anchor Naming Conventions -* Theory anchors: `#operation-definition`, `#operation-properties`, `#operation-complexity` -* Example anchors: `#basic-usage`, `#advanced-usage`, `#performance-tips` -* Generated 
anchors: `#operator-{name}`, `#function-{operator}-{function}` \ No newline at end of file diff --git a/docs/perf/java-25-cpu-backend.md b/docs/perf/java-25-cpu-backend.md deleted file mode 100644 index e66f588d..00000000 --- a/docs/perf/java-25-cpu-backend.md +++ /dev/null @@ -1,99 +0,0 @@ -### Java 25 Advantages for the JVM CPU Backend - -Java 25 (GA September 2025) delivers significant free performance improvements to the -SKaiNET JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup -features β€” all without requiring code changes. - -#### Compatibility - -The same code, same flags, and same runtime detection work across JDK 21–25: - -- Vector API remains incubator on JDK 25 (JEP 508) β€” identical `jdk.incubator.vector` package. -- Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22+. -- Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. -- Build config (`jvmTarget = JVM_21`, `options.release.set(21)`) produces compatible bytecode. - -**No special treatment is needed for JDK >= 21 but < 25.** - -Required flags remain: -``` ---enable-preview --add-modules jdk.incubator.vector -``` - -#### JIT / C2 improvements mapped to SKaiNET ops - -These are automatic β€” the JIT produces better native code for existing bytecode. 
- -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| VPointer refactoring for vector loads/stores | [JDK-8350748](https://bugs.openjdk.org/browse/JDK-8350748) | up to 14x | All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` | -| SuperWord SIMD enhancement | [JDK-8343685](https://bugs.openjdk.org/browse/JDK-8343685) | up to 33x | Same vectorized loops (elementwise, reductions, matmul inner loops) | -| `Math.max` / `Math.min` intrinsified for `long` | JDK-8350485 | 3–5x | Shape computation, tile clamping in blocked matmul | - -Source files: -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` - -#### Panama FFI improvements - -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| Faster `MemorySegment` allocation | [JDK-8345687](https://bugs.openjdk.org/browse/JDK-8345687) | ~2x | `MemorySegmentTensorData.kt` (`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` | -| `MemorySegment::fill` optimized on AArch64 | [JDK-8354674](https://bugs.openjdk.org/browse/JDK-8354674) | ~2.5x | Tensor zeroing, blocked matmul result initialization | - -Source files: -- `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` -- `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` - -#### Object layout and GC - -- **Compact Object Headers** (JEP 519) β€” reduces object header from 12 to 8 bytes. - Meaningful for tensor metadata arrays with millions of small objects. - Opt-in: `-XX:+UseCompactObjectHeaders` - -- **Generational Shenandoah** (JEP 521) β€” lower GC pause times for allocation-heavy - workloads (tensor creation, KV cache churn). 
- Opt-in: `-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational` - -#### Startup and warmup - -- **AOT profiling / caching** (JEP 515) β€” records JIT profile data from a training run - and replays it on subsequent launches. Reduces warmup by 15–25%. - Useful for CLI apps like kLLaMA where first-token latency matters. - -Usage: -``` -# Training run (records profile) -java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" - -# Production run (replays profile) -java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" -``` - -#### Recommended JVM flags for Java 25 - -Required (same as JDK 21–24): -``` ---enable-preview ---add-modules jdk.incubator.vector -``` - -Optional β€” enable for maximum benefit on JDK 25: -``` --XX:+UseCompactObjectHeaders --XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational --XX:AOTCache=app.aot # after training run -``` - -#### Summary - -| Feature | Benefit | Component | -|---|---|---| -| VPointer refactoring (C2) | Up to 14x faster vector loads/stores | `JvmVectorKernels`, `JvmQuantizedVectorKernels` | -| SuperWord SIMD (C2) | Up to 33x faster auto-vectorized loops | Same vector kernel files | -| `Math.max/min` intrinsic | 3–5x faster long comparisons | Shape computation, tile clamping | -| Faster segment allocation | ~2x allocation throughput | `MemorySegmentTensorDataFactory`, `PagedKvCache` | -| `MemorySegment::fill` (AArch64) | ~2.5x faster bulk zeroing | Tensor init, matmul result buffers | -| Compact Object Headers | ~30% smaller object headers | All tensor metadata | -| Generational Shenandoah | Lower GC pauses | Allocation-heavy inference | -| AOT profiling | 15–25% faster warmup | CLI apps (kLLaMA) | diff --git a/docs/perf/jvm-cpu.md b/docs/perf/jvm-cpu.md deleted file mode 100644 index fc981566..00000000 --- a/docs/perf/jvm-cpu.md +++ /dev/null @@ -1,94 +0,0 @@ -### JVM CPU Backend Performance Benchmarks (JMH) - -This page explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence 
for performance targets. - -#### What’s included -- Elementwise: FP32 `add` on 1,000,000 elements -- Reductions: FP32 `sum` and `mean` on 1,000,000 elements -- Matmul: FP32 square `matmul` with sizes 256, 512, and 1024 - -Benchmarks are implemented in module: -- `:skainet-backends:benchmarks:jvm-cpu-jmh` - -Source files: -- `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt` - -#### Prerequisites -- JDK 21+ (JDK 22 toolchain configured by Gradle) -- Gradle will pass required JVM flags: - - `--enable-preview` - - `--add-modules jdk.incubator.vector` - -For Java 25-specific performance advantages, see [Java 25 CPU Backend](java-25-cpu-backend.md). - -#### Feature flags -You can toggle acceleration paths at runtime using system properties or environment variables: -- Vector acceleration: - - `-Dskainet.cpu.vector.enabled=true|false` - - or `SKAINET_CPU_VECTOR_ENABLED=true|false` -- BLAS via Panama (matmul heuristic for larger sizes): - - `-Dskainet.cpu.blas.enabled=true|false` - - or `SKAINET_CPU_BLAS_ENABLED=true|false` - -Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args. - -#### How to run all benchmarks -From repository root: - -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh -``` - -This will build and execute all JMH benchmarks with the default parameters defined in sources. 
- -#### Run specific benchmarks -- Elementwise add (both vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=ElementwiseAdd1MBench -``` - -- Reductions (vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=Reductions1MBench -``` - -- Matmul, all sizes, with vector on and BLAS on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true -``` - -- Matmul at 512 only, comparing BLAS on/off with vector on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.size=512 \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true,false -``` - -Notes: -- You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). -- JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. - -#### Recording environment details -Include at minimum: -- CPU model, cores/threads, base/boost clock -- RAM size and speed -- OS version -- JDK version and vendor -- Gradle version -- JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) -- SKaiNET flags used (vector, BLAS) - -#### Performance targets (to be validated on your hardware) -- β‰₯ 4Γ— speedup on FP32 `matmul` 512Γ—512 vs baseline scalar -- β‰₯ 3Γ— speedup on FP32 `add` with 1M elements vs baseline scalar - -Use the above commands to produce β€œvector=false/blas=false” baselines vs β€œvector=true[/blas=true]” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. 
diff --git a/docs/skainet-4-ai.md b/docs/skainet-4-ai.md deleted file mode 100644 index d9d2c6a6..00000000 --- a/docs/skainet-4-ai.md +++ /dev/null @@ -1,127 +0,0 @@ -# SKaiNET Core Technology: Tensor & Data Guide - -This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. - -## 1. Fundamental Architecture: Tensor Composition - -Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a **compositional architecture**. A `Tensor` is composed of two primary components: - -1. **`TensorData`**: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. -2. **`TensorOps`**: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). - -This separation allows for high flexibility, such as switching execution backends without changing the data representation. - -```kotlin -interface Tensor { - val data: TensorData - val ops: TensorOps - val dtype: KClass - val shape: Shape -} -``` - -## 2. Type-Safe Tensor Creation (DSL) - -SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). - -### Creation with `ExecutionContext` - -Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. 
- -```kotlin -// Basic creation -val zeros = ctx.zeros(Shape(2, 3), FP32::class) -val ones = ctx.ones(Shape(1, 10), Int32::class) -val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) -``` - -### Expressive Tensor DSL - -For more complex initializations, use the `tensor` DSL: - -```kotlin -val myTensor = tensor(ctx, FP32::class) { - shape(2, 2) { - from(1.0f, 2.0f, 3.0f, 4.0f) - } -} - -val randomTensor = tensor(ctx, FP32::class) { - shape(10, 10) { - randn(mean = 0f, std = 1f) - } -} - -val customInit = tensor(ctx, Int32::class) { - shape(5, 5) { - init { indices -> indices[0] + indices[1] } - } -} -``` - -## 3. Slicing DSL API - -SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. - -### `sliceView` vs `sliceCopy` - -- **`sliceView`**: Creates a `TensorView`, which is a window into the original data (no data copying). -- **`sliceCopy`**: Creates a new `Tensor` with a copy of the sliced data. - -### Slicing DSL Syntax - -The `SegmentBuilder` provides several ways to define slices for each dimension: - -- `range(start, end)`: A range of indices. -- `at(index)`: A single index (reduces rank). -- `all()`: All elements in that dimension (equivalent to `:` in NumPy). -- `step(start, end, step)`: Strided access. -- `+all()`: Short-hand for `all()`. - -```kotlin -val source = ctx.ones(Shape(10, 20, 30), FP32::class) - -// Slicing: [0:5, 10, :] -val view = source.sliceView { - segment { range(0, 5) } // Dim 0 - segment { at(10) } // Dim 1 - segment { all() } // Dim 2 -} -``` - -## 4. Core Operations (`TensorOps`) - -All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: - -- **Element-wise Ops**: `add`, `subtract`, `multiply`, `divide` (and scalar versions). -- **Linear Algebra**: `matmul`, `transpose`. -- **Neural Network Ops**: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. -- **Reductions**: `sum`, `mean`, `variance`. 
-- **Shape Ops**: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. - -### Operator Overloading - -When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: - -```kotlin -val c = a + b // Calls ops.add(a, b) -val d = a * 10 // Calls ops.mulScalar(a, 10) -``` - -## 5. Summary Table: SKaiNET vs NumPy - -| Feature | NumPy | SKaiNET | -| :--- | :--- | :--- | -| **Primary Type** | `ndarray` | `Tensor` | -| **Creation** | `np.array([1, 2, 3])` | `tensor(ctx, FP32::class) { shape(3) { from(1f, 2f, 3f) } }` | -| **Zeros** | `np.zeros((2, 2))` | `ctx.zeros(Shape(2, 2), FP32::class)` | -| **Slicing** | `a[0:5, :]` | `a.sliceView { segment { range(0, 5) }; segment { all() } }` | -| **Matmul** | `a @ b` or `np.matmul(a, b)` | `ctx.ops.matmul(a, b)` | -| **Reshape** | `a.reshape(new_shape)` | `ctx.ops.reshape(a, Shape(new_shape))` | - -## 6. Best Practices for AI Integration - -1. **Context Awareness**: Always pass the `ExecutionContext` to functions that create or manipulate tensors. -2. **Type Safety**: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. -3. **Views over Copies**: Use `sliceView` whenever possible to minimize memory overhead and improve performance. -4. **Backend Agnostic**: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend.