diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 32a7ed1e..56426f25 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -87,8 +87,16 @@ jobs:
           cache-to: type=gha,mode=max
 
       - name: Build Antora site
+        # Run the container as the runner user (not root) so the
+        # files under docs/build/site/ are owned by the same user
+        # that the subsequent Gradle `bundleDokkaIntoSite` step runs
+        # as. Without this the Copy task fails with
+        # "Failed to create directory docs/build/site/api" because
+        # the Antora container otherwise writes the site tree as
+        # root and Gradle running as runner can't mkdir inside it.
         run: |
           docker run --rm \
+            --user "$(id -u):$(id -g)" \
             -v "${{ github.workspace }}:/antora" \
             --workdir /antora/docs \
             skainet-antora:local \
diff --git a/docs/.docker/Dockerfile b/docs/.docker/Dockerfile
index 67c21ba6..fecaca3c 100644
--- a/docs/.docker/Dockerfile
+++ b/docs/.docker/Dockerfile
@@ -1,8 +1,8 @@
 FROM node:20-alpine
 
 LABEL org.opencontainers.image.title="SKaiNET Antora" \
-      org.opencontainers.image.description="Antora site generator with built-in Mermaid rendering" \
-      org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET-transformers"
+      org.opencontainers.image.description="Antora site generator with direct local Mermaid rendering (no Kroki round trip)" \
+      org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET"
 
 # Chromium for mermaid-cli (puppeteer)
 RUN apk add --no-cache chromium font-noto
@@ -10,25 +10,34 @@ RUN apk add --no-cache chromium font-noto
 ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \
     PUPPETEER_SKIP_DOWNLOAD=true
 
-# Install Antora + extensions to /opt/antora (not /antora which gets volume-mounted)
+# Install Antora + mermaid-cli into /opt/antora (not /antora which gets
+# volume-mounted at run time). asciidoctor-kroki is intentionally NOT
+# installed — it depends on a Kroki HTTP server (kroki.io or local)
+# which returns 400 for large diagrams when using GET and has no
+# offline fallback. We render mermaid directly via mermaid-cli through
+# the local-mermaid-extension.js asciidoctor block processor.
 WORKDIR /opt/antora
 RUN npm init -y && npm i --save-exact \
       @antora/cli@3.1 \
       @antora/site-generator@3.1 \
-      asciidoctor-kroki@0.18 \
       @mermaid-js/mermaid-cli@11 \
     && npm cache clean --force
 
 # Make installed modules visible when workdir is the mounted project
 ENV NODE_PATH=/opt/antora/node_modules
 
-# Mermaid-cli config
+# Mermaid-cli config — used by the local-mermaid-extension to drive
+# Puppeteer against the pre-installed Alpine Chromium.
 RUN echo '{ \
   "executablePath": "/usr/bin/chromium-browser", \
   "args": ["--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"] \
 }' > /opt/antora/puppeteer-config.json
 
-# Verify mermaid works
+# Bake the local mermaid extension in at an absolute path so the
+# Antora playbook can reference it without any volume-mount gymnastics.
+COPY local-mermaid-extension.js /opt/antora/local-mermaid-extension.js
+
+# Verify mermaid-cli works end to end at image build time.
 RUN echo 'graph TD; A-->B;' > /tmp/test.mmd \
     && npx mmdc -i /tmp/test.mmd -o /tmp/test.svg -p /opt/antora/puppeteer-config.json \
     && rm /tmp/test.mmd /tmp/test.svg
diff --git a/docs/.docker/local-mermaid-extension.js b/docs/.docker/local-mermaid-extension.js
new file mode 100644
index 00000000..35b4c776
--- /dev/null
+++ b/docs/.docker/local-mermaid-extension.js
@@ -0,0 +1,91 @@
+'use strict'
+
+/*
+ * Local mermaid block processor for Asciidoctor.js.
+ *
+ * Replaces the asciidoctor-kroki dependency on kroki.io (and its
+ * GET URL length limit / 400 rejections on large diagrams) with a
+ * direct invocation of `mmdc` — the @mermaid-js/mermaid-cli binary
+ * that the SKaiNET Antora Docker image already bakes in for its
+ * Chromium-backed Puppeteer rendering path.
+ *
+ * The extension is registered via the Antora playbook's
+ * `asciidoc.extensions` list and gets passed the Asciidoctor.js
+ * `registry` object. For every `[mermaid]` listing (`----`) or
+ * literal (`....`) block in any page, we:
+ *
+ *   1. write the source to a temp file
+ *   2. exec `mmdc -i in.mmd -o out.svg -p puppeteer-config.json`
+ *      (synchronous — Antora processes one page at a time and the
+ *      mermaid-cli call is fast enough that sync is fine)
+ *   3. read the produced SVG
+ *   4. inline it via a `pass` block so Asciidoctor emits the raw
+ *      SVG markup straight into the HTML output
+ *
+ * On render failure we fall back to a literal block containing
+ * the original source plus the error message, matching the
+ * degradation mode asciidoctor-kroki uses.
+ */
+
+const { execFileSync, execSync } = require('child_process')
+const { mkdtempSync, writeFileSync, readFileSync, rmSync } = require('fs')
+const { tmpdir } = require('os')
+const { join } = require('path')
+
+// Absolute paths baked into /opt/antora at image build time.
+// These have to match the Dockerfile that installs mermaid-cli and
+// writes the puppeteer config.
+const MMDC_BIN = '/opt/antora/node_modules/.bin/mmdc'
+const PUPPETEER_CONFIG = '/opt/antora/puppeteer-config.json'
+
+function renderMermaidToSvg (source) {
+  const dir = mkdtempSync(join(tmpdir(), 'skainet-mm-'))
+  const inputPath = join(dir, 'in.mmd')
+  const outputPath = join(dir, 'out.svg')
+  writeFileSync(inputPath, source, 'utf8')
+  try {
+    execSync(
+      `${MMDC_BIN} -i ${inputPath} -o ${outputPath} -p ${PUPPETEER_CONFIG} --quiet`,
+      { stdio: ['ignore', 'ignore', 'pipe'] }
+    )
+    return readFileSync(outputPath, 'utf8')
+  } finally {
+    try { rmSync(dir, { recursive: true, force: true }) } catch (_) { /* noop */ }
+  }
+}
+
+function mermaidBlockFactory () {
+  return function () {
+    const self = this
+    self.named('mermaid')
+    self.onContext(['listing', 'literal'])
+    self.process((parent, reader, attrs) => {
+      const source = reader.$read()
+      try {
+        const svg = renderMermaidToSvg(source)
+        return self.createBlock(parent, 'pass', svg, attrs)
+      } catch (err) {
+        const logger = parent.getDocument().getLogger()
+        logger.warn(`local-mermaid-extension: failed to render block — ${err.message}`)
+        const role = attrs.role
+        attrs.role = role ? `${role} mermaid-error` : 'mermaid-error'
+        return self.createBlock(
+          parent,
+          'literal',
+          `Error rendering mermaid diagram:\n${err.message}\n\n${source}`,
+          attrs
+        )
+      }
+    })
+  }
+}
+
+module.exports.register = function register (registry) {
+  if (typeof registry.register === 'function') {
+    registry.register(function () {
+      this.block('mermaid', mermaidBlockFactory())
+    })
+  } else if (typeof registry.block === 'function') {
+    registry.block('mermaid', mermaidBlockFactory())
+  }
+}
diff --git a/docs/antora-playbook.yml b/docs/antora-playbook.yml
index 4c7b9bca..5c9493cf 100644
--- a/docs/antora-playbook.yml
+++ b/docs/antora-playbook.yml
@@ -2,6 +2,16 @@ site:
   title: SKaiNET
   start_page: skainet::index.adoc
 
+# Keep Antora's content cache inside the project tree so the
+# container can be run as a non-root user (via `docker run --user
+# $(id -u):$(id -g)`). Without this, Antora defaults to
+# `$HOME/.cache/antora` which is unwritable when the container
+# process has no matching passwd entry and $HOME falls back to `/`.
+# The `.cache/` path is already gitignored via the pre-staged
+# `## antora` section in the repo root .gitignore.
+runtime:
+  cache_dir: ./.cache/antora
+
 content:
   sources:
     - url: /antora
@@ -10,12 +20,13 @@ content:
 
 asciidoc:
   extensions:
-    - asciidoctor-kroki
-  attributes:
-    # Use local mermaid-cli via Kroki (no external server needed when
-    # built with the custom Docker image in docs/.docker/Dockerfile —
-    # copied verbatim from SKaiNET-transformers).
-    kroki-fetch-diagram: true
+    # Local mermaid block processor — renders every `[mermaid]` block
+    # inline by invoking the @mermaid-js/mermaid-cli binary baked into
+    # the Docker image at /opt/antora/node_modules/.bin/mmdc. Replaces
+    # asciidoctor-kroki so builds don't depend on kroki.io at all,
+    # which eliminates the GET-URL length limit (4 KB) that was
+    # rejecting the large diagrams in hlo-getting-started.adoc.
+    - /opt/antora/local-mermaid-extension.js
 
 ui:
   bundle:
diff --git a/docs/antora.yml b/docs/antora.yml
index 05bf9566..947dcb18 100644
--- a/docs/antora.yml
+++ b/docs/antora.yml
@@ -3,3 +3,15 @@ title: SKaiNET
 version: ~
 nav:
   - modules/ROOT/nav.adoc
+
+# Component-level attributes flow to every page. Defined here so the
+# operator-design article (and any future page) can reference them
+# without each page declaring its own attributes block. If you need
+# to override a value on a per-page basis, declare it above the
+# first section heading on that page.
+asciidoc:
+  attributes:
+    framework_name: SKaiNET
+    ksp_version: 2.2.21-2.0.5
+    dokka_version: 2.1.0
+    asciidoctorj_version: 3.0.0
diff --git a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc
index 2b74c01c..c3183e2d 100644
--- a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc
+++ b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc
@@ -21,7 +21,6 @@ Required flags remain:
 --enable-preview --add-modules jdk.incubator.vector
 ....
 
-[[jit--c2-improvements-mapped-to-skainet-ops]]
 ===== JIT / C2 improvements mapped to SKaiNET ops
 
 These are automatic — the JIT produces better native code for existing bytecode.
diff --git a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc
index 102aa5ac..365a6a3b 100644
--- a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc
+++ b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc
@@ -1,10 +1,8 @@
-[[skainet-core-technology-tensor--data-guide]]
-== SKaiNET Core Technology: Tensor & Data Guide
+= SKaiNET Core Technology: Tensor & Data Guide
 
 This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library.
 
-[[1-fundamental-architecture-tensor-composition]]
-=== 1. Fundamental Architecture: Tensor Composition
+== 1. Fundamental Architecture: Tensor Composition
 
 Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a *compositional architecture*. A `Tensor++<++T, V++>++` is composed of two primary components:
 
@@ -24,12 +22,11 @@ interface Tensor<T : DType, V> {
 }
 ----
 
-[[2-type-safe-tensor-creation-dsl]]
-=== 2. Type-Safe Tensor Creation (DSL)
+== 2. Type-Safe Tensor Creation (DSL)
 
 SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation).
 
-==== Creation with `ExecutionContext`
+=== Creation with `ExecutionContext`
 
 Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`.
 
@@ -41,7 +38,7 @@ val ones = ctx.ones(Shape(1, 10), Int32::class)
 val full = ctx.full(Shape(5, 5), FP32::class, 42.0f)
 ----
 
-==== Expressive Tensor DSL
+=== Expressive Tensor DSL
 
 For more complex initializations, use the `tensor` DSL:
 
@@ -66,17 +63,16 @@ val customInit = tensor(ctx, Int32::class) {
 }
 ----
 
-[[3-slicing-dsl-api]]
-=== 3. Slicing DSL API
+== 3. Slicing DSL API
 
 SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability.
 
-==== `sliceView` vs `sliceCopy`
+=== `sliceView` vs `sliceCopy`
 
 * *`sliceView`*: Creates a `TensorView`, which is a window into the original data (no data copying).
 * *`sliceCopy`*: Creates a new `Tensor` with a copy of the sliced data.
 
-==== Slicing DSL Syntax
+=== Slicing DSL Syntax
 
 The `SegmentBuilder` provides several ways to define slices for each dimension:
 
@@ -98,8 +94,7 @@ val view = source.sliceView {
 }
 ----
 
-[[4-core-operations-tensorops]]
-=== 4. Core Operations (`TensorOps`)
+== 4. Core Operations (`TensorOps`)
 
 All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports:
 
@@ -109,7 +104,7 @@ All mathematical operations are dispatched through the `TensorOps` interface. SK
 * *Reductions*: `sum`, `mean`, `variance`.
 * *Shape Ops*: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`.
 
-==== Operator Overloading
+=== Operator Overloading
 
 When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators:
 
@@ -119,8 +114,7 @@ val c = a + b  // Calls ops.add(a, b)
 val d = a * 10 // Calls ops.mulScalar(a, 10)
 ----
 
-[[5-summary-table-skainet-vs-numpy]]
-=== 5. Summary Table: SKaiNET vs NumPy
+== 5. Summary Table: SKaiNET vs NumPy
 
 [cols="<,<,<",options="header",]
 |===
@@ -133,8 +127,7 @@ val d = a * 10 // Calls ops.mulScalar(a, 10)
 |*Reshape* |`a.reshape(new++_++shape)` |`ctx.ops.reshape(a, Shape(new++_++shape))`
 |===
 
-[[6-best-practices-for-ai-integration]]
-=== 6. Best Practices for AI Integration
+== 6. Best Practices for AI Integration
 
 [arabic]
 . *Context Awareness*: Always pass the `ExecutionContext` to functions that create or manipulate tensors.
diff --git a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc
index 7ef1165c..feb0bf13 100644
--- a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc
+++ b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc
@@ -1,12 +1,12 @@
-== Arduino C Code Generation
+= Arduino C Code Generation
 
 SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino.
 
-=== Overview
+== Overview
 
 The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model.
 
-==== Codegen Pipeline
+=== Codegen Pipeline
 
 [mermaid]
 ----
@@ -21,18 +21,16 @@ graph TD
     H --> I[Generated .h/.c files]
 ----
 
-=== Technical Deep Dive
+== Technical Deep Dive
 
-[[1-tape-based-tracing]]
-==== 1. Tape-based Tracing
+=== 1. Tape-based Tracing
 
 Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`.
 
 * Every operation (Dense, ReLU, etc.) is recorded onto an *Execution Tape*.
 * This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed.
 
-[[2-compute-graph-construction]]
-==== 2. Compute Graph Construction
+=== 2. Compute Graph Construction
 
 The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`.
 
@@ -40,12 +38,11 @@ The execution tape is converted into a directed acyclic graph (DAG) called `Comp
 * Edges represent data flow (Tensors).
 * During this phase, the compiler performs *Shape Inference* to ensure every tensor has a fixed, known size.
 
-[[3-static-memory-management]]
-==== 3. Static Memory Management
+=== 3. Static Memory Management
 
 Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a *Ping-Pong Buffer Strategy* to eliminate dynamic memory allocation (`malloc`/`free`) during inference.
 
-===== Ping-Pong Buffer Strategy
+==== Ping-Pong Buffer Strategy
 
 The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size.
 
@@ -66,8 +63,7 @@ sequenceDiagram
 * *Buffer Reuse*: Instead of allocating space for every layer's output, buffers are reused.
 * *Direct Output Optimization*: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies.
 
-[[4-code-generation-emission]]
-==== 4. Code Generation (Emission)
+=== 4. Code Generation (Emission)
 
 The `CCodeGenerator` emits C99-compatible code using templates.
 
@@ -80,15 +76,14 @@ The `CCodeGenerator` emits C99-compatible code using templates.
 int model_inference(const float* input, float* output);
 ----
 
-[[5-validation]]
-==== 5. Validation
+=== 5. Validation
 
 The generator performs post-generation validation:
 
 * *Static Allocation Check*: Ensures no dynamic allocation is present in the generated source.
 * *Buffer Alternation Check*: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites.
 
-=== Performance and Constraints
+== Performance and Constraints
 
 * *Floating Point*: Currently optimized for `FP32`.
 * *Supported Ops*: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`.
diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
index 2abf7d17..ddb82976 100644
--- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc
+++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc
@@ -173,7 +173,6 @@ float loss = loop.step(inputBatch, targetBatch);
 System.out.printf("Step loss: %.4f%n", loss);
 ----
 
-[[full-training-with-train]]
 ==== Full Training with `.train()`
 
 `train()` accepts a `Supplier` that produces an `Iterator` of `(input, target)` pairs for each epoch:
@@ -194,7 +193,6 @@ System.out.printf("Trained %d epochs, final loss: %.4f%n",
 
 Each call to the supplier should return a fresh iterator over the training batches for that epoch. This allows reshuffling between epochs.
 
-[[async-training-with-trainasync]]
 ==== Async Training with `.trainAsync()`
 
 `trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture++<++TrainingResult++>++`:
diff --git a/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc
index d7d47a92..cad26ee6 100644
--- a/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc
@@ -98,7 +98,6 @@ flowchart LR
 
 === Building Blocks
 
-[[1-hlo-converters]]
 ==== 1. HLO Converters
 
 Converters transform SKaiNET operations into StableHLO operations:
@@ -109,7 +108,6 @@ Converters transform SKaiNET operations into StableHLO operations:
 * *NeuralNetOperationsConverter*: High-level NN operations
 * *ConstantOperationsConverter*: Constant value operations
 
-[[2-type-system]]
 ==== 2. Type System
 
 HLO uses a strict type system for tensors:
@@ -123,7 +121,6 @@ Tensor<Float32, Shape4D> // Batch, Channel, Height, Width
 tensor<1x3x224x224xf32> // StableHLO representation
 ----
 
-[[3-optimization-framework]]
 ==== 3. Optimization Framework
 
 The optimization pipeline includes:
@@ -171,15 +168,15 @@ sequenceDiagram
     participant DAG as Compute Graph
     participant Conv as HLO Converter
     participant HLO as StableHLO IR
-    participant Opt as Optimizer
-    
+    participant Optimizer
+
     DSL->>DAG: rgb2GrayScaleMatMul()
     DAG->>Conv: MatMul + Transpose ops
     Conv->>HLO: stablehlo.dot_general
     Conv->>HLO: stablehlo.transpose
-    HLO->>Opt: Unoptimized IR
-    Opt->>HLO: Optimized IR
-    
+    HLO->>Optimizer: Unoptimized IR
+    Optimizer->>HLO: Optimized IR
+
     Note over Conv,HLO: Type inference:<br/>tensor<BxCxHxWxf32> → tensor<Bx1xHxWxf32>
 ----
 
diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
index 003a6d46..becdecee 100644
--- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
+++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc
@@ -21,7 +21,6 @@ For Maven Surefire / exec-maven-plugin, add them to `++<++jvmArgs++>++`. For Gra
 
 === Maven Setup
 
-[[1-import-the-bom]]
 ==== 1. Import the BOM
 
 The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. Add it to your `++<++dependencyManagement++>++` section:
@@ -80,7 +79,6 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep
 </project>
 ----
 
-[[2-add-more-modules-as-needed]]
 ==== 2. Add More Modules as Needed
 
 Because the BOM is imported, you can add any module without repeating the version: