diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 32a7ed1e..56426f25 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -87,8 +87,16 @@ jobs: cache-to: type=gha,mode=max - name: Build Antora site + # Run the container as the runner user (not root) so the + # files under docs/build/site/ are owned by the same user + # that the subsequent Gradle `bundleDokkaIntoSite` step runs + # as. Without this the Copy task fails with + # "Failed to create directory docs/build/site/api" because + # the Antora container otherwise writes the site tree as + # root and Gradle running as runner can't mkdir inside it. run: | docker run --rm \ + --user "$(id -u):$(id -g)" \ -v "${{ github.workspace }}:/antora" \ --workdir /antora/docs \ skainet-antora:local \ diff --git a/docs/.docker/Dockerfile b/docs/.docker/Dockerfile index 67c21ba6..fecaca3c 100644 --- a/docs/.docker/Dockerfile +++ b/docs/.docker/Dockerfile @@ -1,8 +1,8 @@ FROM node:20-alpine LABEL org.opencontainers.image.title="SKaiNET Antora" \ - org.opencontainers.image.description="Antora site generator with built-in Mermaid rendering" \ - org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET-transformers" + org.opencontainers.image.description="Antora site generator with direct local Mermaid rendering (no Kroki round trip)" \ + org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET" # Chromium for mermaid-cli (puppeteer) RUN apk add --no-cache chromium font-noto @@ -10,25 +10,34 @@ RUN apk add --no-cache chromium font-noto ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \ PUPPETEER_SKIP_DOWNLOAD=true -# Install Antora + extensions to /opt/antora (not /antora which gets volume-mounted) +# Install Antora + mermaid-cli into /opt/antora (not /antora which gets +# volume-mounted at run time). 
asciidoctor-kroki is intentionally NOT +# installed — it depends on a Kroki HTTP server (kroki.io or local) +# which returns 400 for large diagrams when using GET and has no +# offline fallback. We render mermaid directly via mermaid-cli through +# the local-mermaid-extension.js asciidoctor block processor. WORKDIR /opt/antora RUN npm init -y && npm i --save-exact \ @antora/cli@3.1 \ @antora/site-generator@3.1 \ - asciidoctor-kroki@0.18 \ @mermaid-js/mermaid-cli@11 \ && npm cache clean --force # Make installed modules visible when workdir is the mounted project ENV NODE_PATH=/opt/antora/node_modules -# Mermaid-cli config +# Mermaid-cli config — used by the local-mermaid-extension to drive +# Puppeteer against the pre-installed Alpine Chromium. RUN echo '{ \ "executablePath": "/usr/bin/chromium-browser", \ "args": ["--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"] \ }' > /opt/antora/puppeteer-config.json -# Verify mermaid works +# Bake the local mermaid extension in at an absolute path so the +# Antora playbook can reference it without any volume-mount gymnastics. +COPY local-mermaid-extension.js /opt/antora/local-mermaid-extension.js + +# Verify mermaid-cli works end to end at image build time. RUN echo 'graph TD; A-->B;' > /tmp/test.mmd \ && npx mmdc -i /tmp/test.mmd -o /tmp/test.svg -p /opt/antora/puppeteer-config.json \ && rm /tmp/test.mmd /tmp/test.svg diff --git a/docs/.docker/local-mermaid-extension.js b/docs/.docker/local-mermaid-extension.js new file mode 100644 index 00000000..35b4c776 --- /dev/null +++ b/docs/.docker/local-mermaid-extension.js @@ -0,0 +1,91 @@ +'use strict' + +/* + * Local mermaid block processor for Asciidoctor.js. 
+ * + * Replaces the asciidoctor-kroki dependency on kroki.io (and its + * GET URL length limit / 400 rejections on large diagrams) with a + * direct invocation of `mmdc` — the @mermaid-js/mermaid-cli binary + * that the SKaiNET Antora Docker image already bakes in for its + * Chromium-backed Puppeteer rendering path. + * + * The extension is registered via the Antora playbook's + * `asciidoc.extensions` list and gets passed the Asciidoctor.js + * `registry` object. For every `[mermaid]\n----\n...\n----` block + * in any page, we: + * + * 1. write the source to a temp file + * 2. exec `mmdc -i in.mmd -o out.svg -p puppeteer-config.json` + * (synchronous — Antora processes one page at a time and the + * mermaid-cli call is fast enough that sync is fine) + * 3. read the produced SVG + * 4. inline it via a `pass` block so Asciidoctor emits the raw + * SVG markup straight into the HTML output + * + * On render failure we fall back to a literal block containing + * the original source plus the error message, matching the + * degradation mode asciidoctor-kroki uses. + */ + +const { execSync } = require('child_process') +const { mkdtempSync, writeFileSync, readFileSync, rmSync } = require('fs') +const { tmpdir } = require('os') +const { join } = require('path') + +// Absolute paths baked into /opt/antora at image build time. +// These have to match the Dockerfile that installs mermaid-cli and +// writes the puppeteer config. 
+const MMDC_BIN = '/opt/antora/node_modules/.bin/mmdc'
+const PUPPETEER_CONFIG = '/opt/antora/puppeteer-config.json'
+
+// execFileSync spawns mmdc directly from an argv array, so no shell is
+// involved and a path containing spaces or shell metacharacters (for
+// example a custom TMPDIR) cannot break or alter the command line.
+const { execFileSync } = require('child_process')
+
+/**
+ * Render one mermaid diagram to SVG markup by running mmdc inside a
+ * throw-away temp directory.
+ *
+ * @param {string} source - Mermaid diagram source text.
+ * @returns {string} Contents of the SVG file written by mmdc.
+ * @throws {Error} If the temp files cannot be written or read, or if
+ *   mmdc exits with a non-zero status. stderr is captured (the 'pipe'
+ *   stdio slot) and travels with the thrown error instead of being
+ *   echoed to the build log.
+ */
+function renderMermaidToSvg (source) {
+  const dir = mkdtempSync(join(tmpdir(), 'skainet-mm-'))
+  try {
+    const inputPath = join(dir, 'in.mmd')
+    const outputPath = join(dir, 'out.svg')
+    // Written inside the try so a failed write still removes the temp dir.
+    writeFileSync(inputPath, source, 'utf8')
+    execFileSync(
+      MMDC_BIN,
+      ['-i', inputPath, '-o', outputPath, '-p', PUPPETEER_CONFIG, '--quiet'],
+      { stdio: ['ignore', 'ignore', 'pipe'] }
+    )
+    return readFileSync(outputPath, 'utf8')
+  } finally {
+    // Best-effort cleanup: a failure to delete must not mask the render
+    // result or the original error.
+    try { rmSync(dir, { recursive: true, force: true }) } catch (_) { /* noop */ }
+  }
+}
+
+/**
+ * Build the block-processor definition function for `[mermaid]` blocks.
+ *
+ * The returned function is meant to be handed to a registry's `block()`
+ * call; when invoked with the processor as `this` it names the block
+ * 'mermaid', accepts the 'listing' and 'literal' contexts, and installs
+ * the process handler below.
+ *
+ * @returns {Function} Processor definition function (uses `this`).
+ */
+function mermaidBlockFactory () {
+  return function () {
+    const self = this
+    self.named('mermaid')
+    self.onContext(['listing', 'literal'])
+    self.process((parent, reader, attrs) => {
+      const source = reader.$read()
+      try {
+        // NOTE(review): the SVG is inlined verbatim. If mmdc emits the
+        // same root element id for every diagram, pages with several
+        // diagrams would carry duplicate ids — confirm against real output.
+        const svg = renderMermaidToSvg(source)
+        return self.createBlock(parent, 'pass', svg, attrs)
+      } catch (err) {
+        // Degrade instead of failing the build: log a warning, tag the
+        // block with the 'mermaid-error' role (keeping any role the
+        // author set), and show the error plus the diagram source.
+        const logger = parent.getDocument().getLogger()
+        logger.warn(`local-mermaid-extension: failed to render block — ${err.message}`)
+        const role = attrs.role
+        attrs.role = role ? `${role} mermaid-error` : 'mermaid-error'
+        return self.createBlock(
+          parent,
+          'literal',
+          `Error rendering mermaid diagram:\n${err.message}\n\n${source}`,
+          attrs
+        )
+      }
+    })
+  }
+}
+
+/**
+ * Extension entry point: register the mermaid block processor.
+ *
+ * Two registry shapes are handled. If the object has a `register`
+ * method, the processor is added from inside a registration callback
+ * via `this.block(...)`; otherwise, if it has a `block` method, the
+ * processor is added directly. An object with neither is left untouched.
+ *
+ * @param {object} registry - Registry object passed in by the caller.
+ */
+module.exports.register = function register (registry) {
+  if (typeof registry.register === 'function') {
+    registry.register(function () {
+      this.block('mermaid', mermaidBlockFactory())
+    })
+  } else if (typeof registry.block === 'function') {
+    registry.block('mermaid', mermaidBlockFactory())
+  }
+}
diff --git a/docs/antora-playbook.yml b/docs/antora-playbook.yml index 4c7b9bca..5c9493cf 100644 --- a/docs/antora-playbook.yml +++ b/docs/antora-playbook.yml @@ -2,6 +2,16 @@ site: title: SKaiNET start_page: skainet::index.adoc +# Keep Antora's content cache inside the project tree so the +# container can be run as a non-root user (via `docker run --user +# $(id -u):$(id -g)`). Without this, Antora defaults to +# `$HOME/.cache/antora` which is unwritable when the container +# process has no matching passwd entry and $HOME falls back to `/`. +# The `.cache/` path is already gitignored via the pre-staged +# `## antora` section in the repo root .gitignore. +runtime: + cache_dir: ./.cache/antora + content: sources: - url: /antora @@ -10,12 +20,13 @@ content: asciidoc: extensions: - - asciidoctor-kroki - attributes: - # Use local mermaid-cli via Kroki (no external server needed when - # built with the custom Docker image in docs/.docker/Dockerfile — - # copied verbatim from SKaiNET-transformers). - kroki-fetch-diagram: true + # Local mermaid block processor — renders every `[mermaid]` block + # inline by invoking the @mermaid-js/mermaid-cli binary baked into + # the Docker image at /opt/antora/node_modules/.bin/mmdc. Replaces + # asciidoctor-kroki so builds don't depend on kroki.io at all, + # which eliminates the GET-URL length limit (4 KB) that was + # rejecting the large diagrams in hlo-getting-started.adoc. 
+ - /opt/antora/local-mermaid-extension.js ui: bundle: diff --git a/docs/antora.yml b/docs/antora.yml index 05bf9566..947dcb18 100644 --- a/docs/antora.yml +++ b/docs/antora.yml @@ -3,3 +3,15 @@ title: SKaiNET version: ~ nav: - modules/ROOT/nav.adoc + +# Component-level attributes flow to every page. Defined here so the +# operator-design article (and any future page) can reference them +# without each page declaring its own attributes block. If you need +# to override a value on a per-page basis, declare it above the +# first section heading on that page. +asciidoc: + attributes: + framework_name: SKaiNET + ksp_version: 2.2.21-2.0.5 + dokka_version: 2.1.0 + asciidoctorj_version: 3.0.0 diff --git a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc index 2b74c01c..c3183e2d 100644 --- a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc +++ b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc @@ -21,7 +21,6 @@ Required flags remain: --enable-preview --add-modules jdk.incubator.vector .... -[[jit--c2-improvements-mapped-to-skainet-ops]] ===== JIT / C2 improvements mapped to SKaiNET ops These are automatic — the JIT produces better native code for existing bytecode. diff --git a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc index 102aa5ac..365a6a3b 100644 --- a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc +++ b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc @@ -1,10 +1,8 @@ -[[skainet-core-technology-tensor--data-guide]] -== SKaiNET Core Technology: Tensor & Data Guide += SKaiNET Core Technology: Tensor & Data Guide This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. -[[1-fundamental-architecture-tensor-composition]] -=== 1. 
Fundamental Architecture: Tensor Composition +== 1. Fundamental Architecture: Tensor Composition Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a *compositional architecture*. A `Tensor++<++T, V++>++` is composed of two primary components: @@ -24,12 +22,11 @@ interface Tensor { } ---- -[[2-type-safe-tensor-creation-dsl]] -=== 2. Type-Safe Tensor Creation (DSL) +== 2. Type-Safe Tensor Creation (DSL) SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). -==== Creation with `ExecutionContext` +=== Creation with `ExecutionContext` Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. @@ -41,7 +38,7 @@ val ones = ctx.ones(Shape(1, 10), Int32::class) val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) ---- -==== Expressive Tensor DSL +=== Expressive Tensor DSL For more complex initializations, use the `tensor` DSL: @@ -66,17 +63,16 @@ val customInit = tensor(ctx, Int32::class) { } ---- -[[3-slicing-dsl-api]] -=== 3. Slicing DSL API +== 3. Slicing DSL API SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. -==== `sliceView` vs `sliceCopy` +=== `sliceView` vs `sliceCopy` * *`sliceView`*: Creates a `TensorView`, which is a window into the original data (no data copying). * *`sliceCopy`*: Creates a new `Tensor` with a copy of the sliced data. -==== Slicing DSL Syntax +=== Slicing DSL Syntax The `SegmentBuilder` provides several ways to define slices for each dimension: @@ -98,8 +94,7 @@ val view = source.sliceView { } ---- -[[4-core-operations-tensorops]] -=== 4. Core Operations (`TensorOps`) +== 4. Core Operations (`TensorOps`) All mathematical operations are dispatched through the `TensorOps` interface. 
SKaiNET supports: @@ -109,7 +104,7 @@ All mathematical operations are dispatched through the `TensorOps` interface. SK * *Reductions*: `sum`, `mean`, `variance`. * *Shape Ops*: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. -==== Operator Overloading +=== Operator Overloading When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: @@ -119,8 +114,7 @@ val c = a + b // Calls ops.add(a, b) val d = a * 10 // Calls ops.mulScalar(a, 10) ---- -[[5-summary-table-skainet-vs-numpy]] -=== 5. Summary Table: SKaiNET vs NumPy +== 5. Summary Table: SKaiNET vs NumPy [cols="<,<,<",options="header",] |=== @@ -133,8 +127,7 @@ val d = a * 10 // Calls ops.mulScalar(a, 10) |*Reshape* |`a.reshape(new++_++shape)` |`ctx.ops.reshape(a, Shape(new++_++shape))` |=== -[[6-best-practices-for-ai-integration]] -=== 6. Best Practices for AI Integration +== 6. Best Practices for AI Integration [arabic] . *Context Awareness*: Always pass the `ExecutionContext` to functions that create or manipulate tensors. diff --git a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc index 7ef1165c..feb0bf13 100644 --- a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc +++ b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc @@ -1,12 +1,12 @@ -== Arduino C Code Generation += Arduino C Code Generation SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. -=== Overview +== Overview The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. 
-==== Codegen Pipeline +=== Codegen Pipeline [mermaid] ---- @@ -21,18 +21,16 @@ graph TD H --> I[Generated .h/.c files] ---- -=== Technical Deep Dive +== Technical Deep Dive -[[1-tape-based-tracing]] -==== 1. Tape-based Tracing +=== 1. Tape-based Tracing Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. * Every operation (Dense, ReLU, etc.) is recorded onto an *Execution Tape*. * This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. -[[2-compute-graph-construction]] -==== 2. Compute Graph Construction +=== 2. Compute Graph Construction The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. @@ -40,12 +38,11 @@ The execution tape is converted into a directed acyclic graph (DAG) called `Comp * Edges represent data flow (Tensors). * During this phase, the compiler performs *Shape Inference* to ensure every tensor has a fixed, known size. -[[3-static-memory-management]] -==== 3. Static Memory Management +=== 3. Static Memory Management Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a *Ping-Pong Buffer Strategy* to eliminate dynamic memory allocation (`malloc`/`free`) during inference. -===== Ping-Pong Buffer Strategy +==== Ping-Pong Buffer Strategy The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. @@ -66,8 +63,7 @@ sequenceDiagram * *Buffer Reuse*: Instead of allocating space for every layer's output, buffers are reused. * *Direct Output Optimization*: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. -[[4-code-generation-emission]] -==== 4. 
Code Generation (Emission) +=== 4. Code Generation (Emission) The `CCodeGenerator` emits C99-compatible code using templates. @@ -80,15 +76,14 @@ The `CCodeGenerator` emits C99-compatible code using templates. int model_inference(const float* input, float* output); ---- -[[5-validation]] -==== 5. Validation +=== 5. Validation The generator performs post-generation validation: * *Static Allocation Check*: Ensures no dynamic allocation is present in the generated source. * *Buffer Alternation Check*: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. -=== Performance and Constraints +== Performance and Constraints * *Floating Point*: Currently optimized for `FP32`. * *Supported Ops*: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc index 2abf7d17..ddb82976 100644 --- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -173,7 +173,6 @@ float loss = loop.step(inputBatch, targetBatch); System.out.printf("Step loss: %.4f%n", loss); ---- -[[full-training-with-train]] ==== Full Training with `.train()` `train()` accepts a `Supplier` that produces an `Iterator` of `(input, target)` pairs for each epoch: @@ -194,7 +193,6 @@ System.out.printf("Trained %d epochs, final loss: %.4f%n", Each call to the supplier should return a fresh iterator over the training batches for that epoch. This allows reshuffling between epochs. 
-[[async-training-with-trainasync]] ==== Async Training with `.trainAsync()` `trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture++<++TrainingResult++>++`: diff --git a/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc index d7d47a92..cad26ee6 100644 --- a/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc @@ -98,7 +98,6 @@ flowchart LR === Building Blocks -[[1-hlo-converters]] ==== 1. HLO Converters Converters transform SKaiNET operations into StableHLO operations: @@ -109,7 +108,6 @@ Converters transform SKaiNET operations into StableHLO operations: * *NeuralNetOperationsConverter*: High-level NN operations * *ConstantOperationsConverter*: Constant value operations -[[2-type-system]] ==== 2. Type System HLO uses a strict type system for tensors: @@ -123,7 +121,6 @@ Tensor // Batch, Channel, Height, Width tensor<1x3x224x224xf32> // StableHLO representation ---- -[[3-optimization-framework]] ==== 3. Optimization Framework The optimization pipeline includes: @@ -171,15 +168,15 @@ sequenceDiagram participant DAG as Compute Graph participant Conv as HLO Converter participant HLO as StableHLO IR - participant Opt as Optimizer - + participant Optimizer + DSL->>DAG: rgb2GrayScaleMatMul() DAG->>Conv: MatMul + Transpose ops Conv->>HLO: stablehlo.dot_general Conv->>HLO: stablehlo.transpose - HLO->>Opt: Unoptimized IR - Opt->>HLO: Optimized IR - + HLO->>Optimizer: Unoptimized IR + Optimizer->>HLO: Optimized IR + Note over Conv,HLO: Type inference:
tensor → tensor ---- diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index 003a6d46..becdecee 100644 --- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -21,7 +21,6 @@ For Maven Surefire / exec-maven-plugin, add them to `++<++jvmArgs++>++`. For Gra === Maven Setup -[[1-import-the-bom]] ==== 1. Import the BOM The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. Add it to your `++<++dependencyManagement++>++` section: @@ -80,7 +79,6 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep ---- -[[2-add-more-modules-as-needed]] ==== 2. Add More Modules as Needed Because the BOM is imported, you can add any module without repeating the version: