diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..32a7ed1e --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,122 @@ +name: Docs + +# Build the Antora site (with generated operator pages and the +# cross-backend coverage matrix) on every PR, and publish to GitHub +# Pages on pushes to develop. Dokka API bundling is wired in +# commit 6 of the docs-to-Antora migration (see issue #494). + +on: + push: + branches: [ main, develop ] + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + pull_request: + paths: + - 'docs/**' + - '.github/workflows/docs.yml' + - 'build.gradle.kts' + - 'build-logic/**' + - 'skainet-lang/skainet-lang-core/**' + workflow_dispatch: + +concurrency: + group: docs-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pages: write + id-token: write + +jobs: + build-docs: + runs-on: ubuntu-latest + timeout-minutes: 30 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + # JDK 25 matches the version used by every other workflow in + # this repo. Runs on the RUNNER, not inside the Docker + # container, so the Gradle wrapper cache works and generateDocs + # / dokkaGenerate see the right JDK. + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '25' + + - name: Cache Gradle + uses: actions/cache@v4 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + key: gradle-${{ runner.os }}-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties', '**/libs.versions.toml') }} + restore-keys: | + gradle-${{ runner.os }}- + + # Emit the KSP-driven operator fragments and the coverage + # matrix into docs/modules/ROOT/pages/reference/operators/. 
+ # Also generate the full Dokka API aggregate so commit 6 can + # bundle it; running both here means commit 6 is a pure + # workflow-step + Gradle-task-registration change with no + # Gradle re-run cost. + - name: Generate operator docs and Dokka + run: ./gradlew --no-daemon generateDocs dokkaGenerate + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # The Chromium layer makes the image ~400 MB. First build is + # ~3–5 minutes; subsequent runs are sub-minute via the GHA + # cache. Transformers skipped caching here β€” this workflow + # improves on that. + - name: Build Antora image + uses: docker/build-push-action@v5 + with: + context: docs/.docker + tags: skainet-antora:local + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Build Antora site + run: | + docker run --rm \ + -v "${{ github.workspace }}:/antora" \ + --workdir /antora/docs \ + skainet-antora:local \ + --stacktrace \ + antora-playbook.yml + + # Bundle Dokka HTML under a sibling `/api/` path of the + # Antora site. Must run AFTER Antora has populated + # docs/build/site/, never before β€” bundleDokkaIntoSite is a + # plain Copy task that would otherwise pre-create the target + # directory and the later Antora run would wipe it. 
+ - name: Bundle Dokka API into site + run: ./gradlew --no-daemon bundleDokkaIntoSite + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs/build/site + + deploy-docs: + if: github.ref == 'refs/heads/develop' && github.event_name == 'push' + needs: build-docs + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/dokka-pages.yml b/.github/workflows/dokka-pages.yml deleted file mode 100644 index ec20dd17..00000000 --- a/.github/workflows/dokka-pages.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: Dokka API Docs β†’ GitHub Pages - -on: - push: - branches: [ main, feature/14-dokka ] - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: pages - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout - uses: actions/checkout@v6 - - - name: Copy CI gradle.properties - run: mkdir -p ~/.gradle ; cp .github/ci-gradle.properties ~/.gradle/gradle.properties - - - name: Set up JDK 25 - uses: actions/setup-java@v5 - with: - distribution: 'zulu' - java-version: 25 - - - name: Setup Gradle - uses: gradle/actions/setup-gradle@v6 - - - name: Generate Dokka HTML - run: ./gradlew dokkaGenerate --no-daemon - - - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v4 - with: - path: build/dokka/html - - deploy: - needs: build - runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v5 diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 564b49fc..aecc5c0f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ -# πŸ—οΈ Architecture -SKaiNET uses a hybrid backend strategy that separates 
development iteration from production deployment. +# Architecture -![Architecture diagram of SKaiNET compiler](docs/SKaiNET-compiler.svg) +See the published site: +https://skainet-developers.github.io/SKaiNET/skainet/reference/architecture.html diff --git a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt index 0d29fcfe..845c8e2f 100644 --- a/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt +++ b/build-logic/convention/src/main/kotlin/GenerateDocumentationTask.kt @@ -65,14 +65,135 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateAsciidoc(module: OperatorDocModule, outputDir: File) { outputDir.mkdirs() - + if (generateIndex.getOrElse(true)) { generateMainIndex(module, outputDir) } - + module.operators.forEach { operator -> generateOperatorPage(operator, module, outputDir) } + + // Sibling cross-backend coverage matrix. Lives one level above + // the per-operator pages so a single URL gives the whole + // picture. Skipped when includeBackendStatus is disabled. + if (includeBackendStatus.getOrElse(true)) { + emitOpsStatusMatrix(module, outputDir) + } + } + + /** + * Emit a single-page `ops-status-matrix.adoc` with rows of + * operator.function pairs and columns of every backend that + * appears in any function's `statusByBackend` map. Cells carry + * the status emoji; a totals footer shows how many functions + * each backend supports out of the total. + * + * Written to [outputDir].parentFile.parentFile so that, under the + * Antora `reference/operators/generated/` layout, the matrix + * lands at `reference/ops-status-matrix.adoc` β€” one navigable + * click away from the operator index and with a stable URL. + * Falls back to writing next to [outputDir] when the path + * doesn't have the expected depth (flat layouts). 
+ */ + private fun emitOpsStatusMatrix(module: OperatorDocModule, outputDir: File) { + val matrixDir = outputDir.parentFile?.parentFile ?: outputDir + matrixDir.mkdirs() + val matrixFile = File(matrixDir, "ops-status-matrix.adoc") + + // Collect every backend that appears anywhere, sorted so the + // column order is stable across runs. + val allBackends: List = module.operators + .flatMap { op -> op.functions.flatMap { it.statusByBackend.keys } } + .toSortedSet() + .toList() + + // Row view: (operator, function) pair -> per-backend status. + data class Row(val operator: String, val function: String, val status: Map) + val rows: List = module.operators.flatMap { op -> + op.functions.map { fn -> Row(op.name, fn.name, fn.statusByBackend) } + } + + matrixFile.writeText(buildString { + appendLine("= Operator Coverage Matrix") + appendLine(":description: Cross-backend status for every operator function in SKaiNET.") + appendLine("") + appendLine("Generated from `operators.json` version `${module.version}` on ${formatTimestamp(module.timestamp)}.") + appendLine("") + appendLine("Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. A missing entry means the backend makes no claim about the function β€” treat it as \"unknown\", not \"not supported\".") + appendLine("") + if (rows.isEmpty() || allBackends.isEmpty()) { + appendLine("NOTE: No backend status information found in the source data.") + appendLine("") + return@buildString + } + + // Table header: 1 col for the row label + 1 col per backend. 
+ val colSpec = (listOf("2") + List(allBackends.size) { "1" }).joinToString(",") + appendLine("[cols=\"$colSpec\", options=\"header\"]") + appendLine("|===") + append("| Operator.function ") + allBackends.forEach { append("| $it ") } + appendLine("") + appendLine("") + + rows.forEach { row -> + append("| `${row.operator}.${row.function}` ") + allBackends.forEach { backend -> + val raw = row.status[backend] + val cell = if (raw == null) "β€”" else shortStatus(raw) + append("| $cell ") + } + appendLine("") + } + + // Totals footer: number of "done" rows per backend out + // of total row count. A status counts as done when it + // maps to the green check in shortStatus. + appendLine("") + append("| *Done* ") + allBackends.forEach { backend -> + val n = rows.count { isDone(it.status[backend]) } + append("| *$n / ${rows.size}* ") + } + appendLine("") + appendLine("|===") + appendLine("") + appendLine("Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference].") + }) + } + + /** + * Short emoji-only rendering of a backend status, for use in the + * compact matrix cells. The long-form wording stays on the + * per-function backend-status table produced by + * [generateBackendStatusTable]. + * + * The vocabulary covers both the planning-style strings + * (`supported` / `partial` / `not_supported` / `planned`) and + * the implementation-style strings the KSP processor actually + * emits today (`implemented` / `in_progress` / `missing`). + * Unknown values fall back to the raw string so the matrix + * never silently hides a status the generator didn't anticipate. 
+ */ + private fun shortStatus(status: String): String = when (status.lowercase()) { + "supported", "implemented", "done" -> "βœ…" + "partial" -> "⚠️" + "not_supported", "missing", "unsupported" -> "❌" + "planned" -> "⏳" + "in_progress", "wip" -> "🚧" + else -> status + } + + /** + * Whether a status string counts toward the totals footer in + * the ops-status matrix. Mirrors the "green check" branch of + * [shortStatus] β€” any status rendered with βœ… is counted as + * done. + */ + private fun isDone(status: String?): Boolean = when (status?.lowercase()) { + "supported", "implemented", "done" -> true + else -> false } private fun generateMarkdown(module: OperatorDocModule, outputDir: File) { @@ -87,6 +208,14 @@ abstract class GenerateDocumentationTask : DefaultTask() { private fun generateMainIndex(module: OperatorDocModule, outputDir: File) { val indexFile = File(outputDir, "index.adoc") + // When the output directory sits under an Antora module's + // `modules//pages/` tree, xrefs in the emitted index + // must be resolved relative to that `pages/` root, not the + // current file. Auto-derive the prefix from the output path + // so the generator works both with Antora and with flat doc + // layouts (empty prefix -> bare filenames, the original + // behavior). 
+ val xrefPrefix = deriveAntoraXrefPrefix(outputDir) indexFile.writeText(buildString { appendLine("= AI-NET Operators Reference") appendLine("") @@ -94,18 +223,41 @@ abstract class GenerateDocumentationTask : DefaultTask() { appendLine("") appendLine("== Operators by Modality") appendLine("") - + val operatorsByModality = module.operators.groupBy { it.modality } operatorsByModality.forEach { (modality, operators) -> appendLine("=== ${modality.capitalize()}") appendLine("") operators.forEach { operator -> - appendLine("* xref:${operator.name.lowercase()}.adoc[${operator.name}]") + appendLine("* xref:$xrefPrefix${operator.name.lowercase()}.adoc[${operator.name}]") } appendLine("") } }) } + + /** + * If [outputDir] lives under an Antora `modules//pages/...` + * tree, return the path segment from `pages/` down to the output + * directory, suffixed with `/`. Otherwise return an empty string, + * so the generator emits bare-filename xrefs (the pre-Antora + * behavior). + * + * Example: + * ``` + * /repo/docs/modules/ROOT/pages/reference/operators/generated + * β†’ "reference/operators/generated/" + * /repo/docs/operators/generated β†’ "" + * ``` + */ + private fun deriveAntoraXrefPrefix(outputDir: File): String { + val path = outputDir.absolutePath.replace(File.separatorChar, '/') + val marker = "/pages/" + val idx = path.indexOf(marker) + if (idx < 0) return "" + val tail = path.substring(idx + marker.length) + return if (tail.isEmpty()) "" else "$tail/" + } private fun generateOperatorPage(operator: OperatorDoc, module: OperatorDocModule, outputDir: File) { val operatorFile = File(outputDir, "${operator.name.lowercase()}.adoc") diff --git a/build.gradle.kts b/build.gradle.kts index 27e43398..0df84bb7 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -100,10 +100,12 @@ tasks.register("generateOperatorDocs") { } } -// Documentation plugin configuration +// Documentation plugin configuration β€” emits operator doc fragments +// into the Antora ROOT module so the 
published site can surface them +// under Reference > Operator coverage. documentation { inputFile.set(file("skainet-lang/skainet-lang-core/build/generated/ksp/metadata/commonMain/resources/operators.json")) - outputDirectory.set(file("docs/modules/operators/_generated_")) + outputDirectory.set(file("docs/modules/ROOT/pages/reference/operators/generated")) includeBackendStatus.set(true) generateIndex.set(true) } @@ -153,4 +155,17 @@ dependencies { // Other dokka(project(":skainet-pipeline")) dokka(project(":skainet-models:skainet-model-yolo")) +} + +// Copy the Dokka-generated HTML aggregate into the Antora site +// output as a sibling `/api/` path. Invoked by .github/workflows/docs.yml +// AFTER Antora has populated `docs/build/site/`; intentionally NOT +// wired into the `build` lifecycle so that running `./gradlew build` +// locally never silently creates a half-populated site directory. +tasks.register("bundleDokkaIntoSite") { + group = "documentation" + description = "Copy build/dokka/html into docs/build/site/api for GitHub Pages publish" + dependsOn("dokkaGenerate") + from(layout.buildDirectory.dir("dokka/html")) + into(layout.projectDirectory.dir("docs/build/site/api")) } \ No newline at end of file diff --git a/docs/.docker/Dockerfile b/docs/.docker/Dockerfile new file mode 100644 index 00000000..67c21ba6 --- /dev/null +++ b/docs/.docker/Dockerfile @@ -0,0 +1,37 @@ +FROM node:20-alpine + +LABEL org.opencontainers.image.title="SKaiNET Antora" \ + org.opencontainers.image.description="Antora site generator with built-in Mermaid rendering" \ + org.opencontainers.image.source="https://github.com/SKaiNET-developers/SKaiNET-transformers" + +# Chromium for mermaid-cli (puppeteer) +RUN apk add --no-cache chromium font-noto + +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \ + PUPPETEER_SKIP_DOWNLOAD=true + +# Install Antora + extensions to /opt/antora (not /antora which gets volume-mounted) +WORKDIR /opt/antora +RUN npm init -y && npm i --save-exact \ + 
@antora/cli@3.1 \ + @antora/site-generator@3.1 \ + asciidoctor-kroki@0.18 \ + @mermaid-js/mermaid-cli@11 \ + && npm cache clean --force + +# Make installed modules visible when workdir is the mounted project +ENV NODE_PATH=/opt/antora/node_modules + +# Mermaid-cli config +RUN echo '{ \ + "executablePath": "/usr/bin/chromium-browser", \ + "args": ["--no-sandbox", "--disable-gpu", "--disable-dev-shm-usage"] \ +}' > /opt/antora/puppeteer-config.json + +# Verify mermaid works +RUN echo 'graph TD; A-->B;' > /tmp/test.mmd \ + && npx mmdc -i /tmp/test.mmd -o /tmp/test.svg -p /opt/antora/puppeteer-config.json \ + && rm /tmp/test.mmd /tmp/test.svg + +ENTRYPOINT ["/opt/antora/node_modules/.bin/antora"] +CMD ["--stacktrace", "antora-playbook.yml"] diff --git a/docs/antora-playbook.yml b/docs/antora-playbook.yml new file mode 100644 index 00000000..4c7b9bca --- /dev/null +++ b/docs/antora-playbook.yml @@ -0,0 +1,26 @@ +site: + title: SKaiNET + start_page: skainet::index.adoc + +content: + sources: + - url: /antora + start_path: docs + branches: HEAD + +asciidoc: + extensions: + - asciidoctor-kroki + attributes: + # Use local mermaid-cli via Kroki (no external server needed when + # built with the custom Docker image in docs/.docker/Dockerfile β€” + # copied verbatim from SKaiNET-transformers). 
+ kroki-fetch-diagram: true + +ui: + bundle: + url: https://gitlab.com/antora/antora-ui-default/-/jobs/artifacts/HEAD/raw/build/ui-bundle.zip?job=bundle-stable + snapshot: true + +output: + dir: ./build/site diff --git a/docs/antora.yml b/docs/antora.yml new file mode 100644 index 00000000..05bf9566 --- /dev/null +++ b/docs/antora.yml @@ -0,0 +1,5 @@ +name: skainet +title: SKaiNET +version: ~ +nav: + - modules/ROOT/nav.adoc diff --git a/docs/arduino-c-codegen.md b/docs/arduino-c-codegen.md deleted file mode 100644 index 5bc9eda8..00000000 --- a/docs/arduino-c-codegen.md +++ /dev/null @@ -1,75 +0,0 @@ -# Arduino C Code Generation - -SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. - -## Overview - -The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. - -### Codegen Pipeline - -```mermaid -graph TD - A[Kotlin Model] --> B[Recording Pass] - B --> C[Execution Tape] - C --> D[Compute Graph] - D --> E[Graph Validation] - E --> F[Memory Layout Calculation] - F --> G[C Code Emission] - G --> H[Arduino Library Packaging] - H --> I[Generated .h/.c files] -``` - -## Technical Deep Dive - -### 1. Tape-based Tracing -Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. -- Every operation (Dense, ReLU, etc.) is recorded onto an **Execution Tape**. -- This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. - -### 2. Compute Graph Construction -The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. 
-- Nodes represent operations (Ops). -- Edges represent data flow (Tensors). -- During this phase, the compiler performs **Shape Inference** to ensure every tensor has a fixed, known size. - -### 3. Static Memory Management -Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a **Ping-Pong Buffer Strategy** to eliminate dynamic memory allocation (`malloc`/`free`) during inference. - -#### Ping-Pong Buffer Strategy -The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. - -```mermaid -sequenceDiagram - participant I as Input - participant B1 as Buffer A - participant B2 as Buffer B - participant O as Output - - I->>B1: Layer 1 (Input -> A) - B1->>B2: Layer 2 (A -> B) - B2->>B1: Layer 3 (B -> A) - B1->>O: Layer 4 (A -> Output) -``` - -- **Buffer Reuse**: Instead of allocating space for every layer's output, buffers are reused. -- **Direct Output Optimization**: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. - -### 4. Code Generation (Emission) -The `CCodeGenerator` emits C99-compatible code using templates. -- **Weights & Biases**: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. -- **Kernel Implementation**: Operations like `Dense` (Linear) are implemented as optimized nested loops. -- **Header Generation**: Produces a clean API for the user: - ```c - int model_inference(const float* input, float* output); - ``` - -### 5. Validation -The generator performs post-generation validation: -- **Static Allocation Check**: Ensures no dynamic allocation is present in the generated source. -- **Buffer Alternation Check**: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. 
- -## Performance and Constraints -- **Floating Point**: Currently optimized for `FP32`. -- **Supported Ops**: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. -- **Memory**: Total memory consumption is `TotalWeights + 2 * MaxIntermediateTensor`. diff --git a/docs/build_help.md b/docs/build_help.md deleted file mode 100644 index b6413e73..00000000 --- a/docs/build_help.md +++ /dev/null @@ -1,81 +0,0 @@ -# Build Help - -## Dokka API Documentation - -SKaiNET uses [Dokka 2.1.0](https://github.com/Kotlin/dokka) to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. - -### Generating docs locally - -**Single module:** - -```bash -./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml -``` - -Output: `skainet-lang/skainet-lang-core/build/dokka/html/` - -**Aggregated (all modules):** - -```bash -./gradlew dokkaGenerate -``` - -Output: `build/dokka/html/index.html` - -### Convention plugin details - -The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: - -- **moduleName** from `project.name` -- **moduleVersion** from the `VERSION_NAME` Gradle property -- **Documented visibilities:** public only -- **Suppressed generated files:** KSP-generated code is excluded -- **Suppressed native source sets:** `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols -- **Source links** pointing to the GitHub repository - -### Modules with Dokka enabled - -The plugin is applied to 21 library modules: - -| Group | Modules | -|-------|---------| -| skainet-lang | `skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` | -| skainet-compile | `skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, 
`skainet-compile-hlo`, `skainet-compile-c` | -| skainet-backends | `skainet-backend-cpu` | -| skainet-data | `skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, `skainet-data-media` | -| skainet-io | `skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` | -| Other | `skainet-pipeline`, `skainet-model-yolo` | - -**Excluded:** `skainet-bom` (no source), `skainet-apps/*`, `skainet-test/*`, benchmarks, and `skainet-lang-ksp-processor` (internal). - -### Root-level aggregation - -The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. - -### KSP interaction - -`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: - -```kotlin -tasks.matching { it.name.startsWith("dokka") }.configureEach { - dependsOn("kspCommonMainKotlinMetadata") -} -``` - -This ensures KSP-generated sources are available before Dokka runs. - -### GitHub Pages deployment - -The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow_dispatch`). It: - -1. Checks out the repo -2. Sets up JDK 25 -3. Runs `./gradlew dokkaGenerate` -4. Uploads the `build/dokka/html` directory as a Pages artifact -5. Deploys to GitHub Pages using `actions/deploy-pages@v4` - -**Prerequisite:** The repository must have Pages configured to deploy from GitHub Actions (Settings > Pages > Source: "GitHub Actions"). - -### Operator docs (unchanged) - -The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. 
diff --git a/docs/kllama-getting-started.md b/docs/kllama-getting-started.md deleted file mode 100644 index 7e7fb8e9..00000000 --- a/docs/kllama-getting-started.md +++ /dev/null @@ -1,22 +0,0 @@ -# KLlama Getting Started - -KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. - -> **Early Stage Development**: The project is in active development. We appreciate your feedback and bug reports! - -## Choose Your Path - -| Goal | Guide | -|---|---| -| **Run models from the command line** | [KLlama CLI](../skainet-apps/skainet-kllama-cli/README.md) | -| **Embed in a Kotlin application** | [KLlama Library](../skainet-apps/skainet-kllama/README.md) | -| **Embed in a Java application** | [Java LLM Inference Guide](java-llm-inference.md) | -| **Build a standalone Java CLI app** | [Java CLI App Guide](java-cli-app.md) | -| **Java project setup (Maven / Gradle)** | [Java Getting Started](java-getting-started.md) | - -## Quick Links - -- [Supported formats & quantization](../skainet-apps/skainet-kllama/README.md#supported-formats--quantization) -- [Custom backend integration](../skainet-apps/skainet-kllama/README.md#custom-backend-integration) -- [Agent & tool calling](java-llm-inference.md#agent-loop-and-tool-calling) -- [BERT embeddings & similarity](java-llm-inference.md#bert-encoding-and-similarity) diff --git a/docs/SKaiNET-compiler.svg b/docs/modules/ROOT/images/SKaiNET-compiler.svg similarity index 100% rename from docs/SKaiNET-compiler.svg rename to docs/modules/ROOT/images/SKaiNET-compiler.svg diff --git a/docs/SKaiNET-logo.png b/docs/modules/ROOT/images/SKaiNET-logo.png similarity index 100% rename from docs/SKaiNET-logo.png rename to docs/modules/ROOT/images/SKaiNET-logo.png diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc new file mode 100644 index 00000000..70418739 --- /dev/null +++ 
b/docs/modules/ROOT/nav.adoc @@ -0,0 +1,32 @@ +* xref:index.adoc[Overview] + +.Tutorials +* xref:tutorials/java-getting-started.adoc[Java getting started] +* xref:tutorials/kllama-getting-started.adoc[KLlama getting started] +* xref:tutorials/hlo-getting-started.adoc[StableHLO getting started] +* xref:tutorials/graph-dsl.adoc[Graph DSL] + +.How-to guides +* xref:how-to/build.adoc[Build from source] +* xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] +* xref:how-to/java-cli-app.adoc[Build a Java CLI app] +* xref:how-to/java-llm-inference.adoc[Run LLM inference] +* xref:how-to/java-model-training.adoc[Train a model] +* xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino] + +.Reference +* xref:reference/architecture.adoc[Architecture] +* xref:reference/operators/generated/index.adoc[Operator reference] +* xref:reference/ops-status-matrix.adoc[Operator coverage matrix] +* xref:reference/api.adoc[API reference (Dokka)] + +.Explanation +* xref:explanation/skainet-for-ai.adoc[SKaiNET for AI/ML] +* xref:explanation/operator-design.adoc[Operator documentation system] +* xref:explanation/theory/index.adoc[Mathematical theory] +** xref:explanation/theory/matmul.adoc[Matrix multiplication] +* xref:explanation/examples/index.adoc[Worked examples] +** xref:explanation/examples/matmul.adoc[Matrix multiplication examples] +* xref:explanation/perf/jvm-cpu.adoc[JVM CPU performance] +* xref:explanation/perf/java-25-cpu-backend.adoc[Java 25 CPU backend notes] +* xref:explanation/issues/native-macos-accelerate-simd.adoc[Native macOS Accelerate SIMD issues] diff --git a/docs/examples/index.adoc b/docs/modules/ROOT/pages/explanation/examples/index.adoc similarity index 87% rename from docs/examples/index.adoc rename to docs/modules/ROOT/pages/explanation/examples/index.adoc index 97630bec..36946d7d 100644 --- a/docs/examples/index.adoc +++ b/docs/modules/ROOT/pages/explanation/examples/index.adoc @@ -7,7 +7,7 @@ This section contains practical examples and usage 
patterns for SKaiNET operator === Linear Algebra -include::matmul-examples.adoc[leveloffset=+2] +include::matmul.adoc[leveloffset=+2] === Tensor Creation and Manipulation @@ -53,5 +53,5 @@ include::matmul-examples.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../theory/index.adoc[Mathematical Theory] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/theory/index.adoc[Mathematical Theory] +// Operator reference lands in a later commit of the Antora migration. \ No newline at end of file diff --git a/docs/examples/matmul-examples.adoc b/docs/modules/ROOT/pages/explanation/examples/matmul.adoc similarity index 100% rename from docs/examples/matmul-examples.adoc rename to docs/modules/ROOT/pages/explanation/examples/matmul.adoc diff --git a/docs/issues/native-macos-accelerate-simd.md b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc similarity index 51% rename from docs/issues/native-macos-accelerate-simd.md rename to docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc index b0317c92..4fa01b33 100644 --- a/docs/issues/native-macos-accelerate-simd.md +++ b/docs/modules/ROOT/pages/explanation/issues/native-macos-accelerate-simd.adoc @@ -1,6 +1,6 @@ -# Native macOS SIMD acceleration via Apple Accelerate framework +== Native macOS SIMD acceleration via Apple Accelerate framework -## Problem +=== Problem The `skainet-backend-cpu` module on Kotlin/Native macOS (macosArm64) uses plain scalar loops for all tensor operations (`DefaultCpuOps`). On JVM, the same module uses the JDK Vector API @@ -11,71 +11,76 @@ When running LLM inference benchmarks via the `llm-performance` native binary, t is 5-10x slower than it needs to be because every matmul is a triple-nested scalar loop (`DefaultCpuOps.kt:264-272`). 
-## Proposed solution +=== Proposed solution Add an Accelerate-backed `TensorOps` implementation for the macOS native target, mirroring how the JVM target has `DefaultCpuOpsJvm`. Apple's Accelerate framework provides hardware-optimized BLAS and vector DSP routines that leverage ARM NEON and AMX under the hood. -### Architecture +==== Architecture -``` +.... PlatformCpuOpsFactory β”œβ”€β”€ jvmMain β†’ DefaultCpuOpsJvm (Vector API + optional BLAS) ← exists β”œβ”€β”€ nativeMain β†’ DefaultCpuOps (scalar fallback) ← exists β”œβ”€β”€ macosMain β†’ AccelerateCpuOps (Accelerate framework via cinterop) ← NEW └── linuxMain β†’ DefaultCpuOps (scalar, or OpenBLAS in future) ← unchanged -``` +.... -### Key changes +==== Key changes -**1. Cinterop definition** β€” `src/nativeInterop/cinterop/accelerate.def` +*1. Cinterop definition* β€” `src/nativeInterop/cinterop/accelerate.def` -```def +[source,def] +---- package = platform.accelerate language = C headers = Accelerate/Accelerate.h compilerOpts = -framework Accelerate linkerOpts = -framework Accelerate -``` +---- -**2. New class** β€” `src/macosMain/kotlin/.../AccelerateCpuOps.kt` +*2. 
New class* β€” `src/macosMain/kotlin/.../AccelerateCpuOps.kt` Extends `DefaultCpuOps` and overrides hot-path operations with Accelerate calls: -| Priority | Operation | Accelerate function | Impact | -|----------|-----------|---------------------|--------| -| P0 | `matmul` | `cblas_sgemm` | Dominant cost in LLM inference (~90% of forward pass) | -| P1 | `add` | `vDSP_vadd` | Elementwise add (residual connections) | -| P1 | `multiply` | `vDSP_vmul` | Elementwise multiply (gates, scaling) | -| P1 | `subtract` | `vDSP_vsub` | Elementwise subtract | -| P1 | `divide` | `vDSP_vdiv` | Elementwise divide | -| P2 | `sum` (global) | `vDSP_sve` | Reduction for normalization | -| P2 | `mean` (global) | `vDSP_meanv` | Reduction for normalization | -| P2 | `softmax` | `vDSP_vse` + manual | Attention weights | -| P3 | `relu` | `vDSP_vthres` / `vDSP_vthr` | Activation function | -| P3 | `silu` | manual vectorized loop | Activation function (SiLU = x * sigmoid(x)) | -| P3 | `transpose` | `vDSP_mtrans` | Matrix transpose | - -**3. Platform factory** β€” update `PlatformCpuOpsFactory` for macOS - -```kotlin +[cols=",,,",options="header",] +|=== +|Priority |Operation |Accelerate function |Impact +|P0 |`matmul` |`cblas++_++sgemm` |Dominant cost in LLM inference (~90% of forward pass) +|P1 |`add` |`vDSP++_++vadd` |Elementwise add (residual connections) +|P1 |`multiply` |`vDSP++_++vmul` |Elementwise multiply (gates, scaling) +|P1 |`subtract` |`vDSP++_++vsub` |Elementwise subtract +|P1 |`divide` |`vDSP++_++vdiv` |Elementwise divide +|P2 |`sum` (global) |`vDSP++_++sve` |Reduction for normalization +|P2 |`mean` (global) |`vDSP++_++meanv` |Reduction for normalization +|P2 |`softmax` |`vDSP++_++vse` {plus} manual |Attention weights +|P3 |`relu` |`vDSP++_++vthres` / `vDSP++_++vthr` |Activation function +|P3 |`silu` |manual vectorized loop |Activation function (SiLU = x ++*++ sigmoid(x)) +|P3 |`transpose` |`vDSP++_++mtrans` |Matrix transpose +|=== + +*3. 
Platform factory* β€” update `PlatformCpuOpsFactory` for macOS + +[source,kotlin] +---- // src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { println("[SKaiNET] Using Accelerate-backed CPU operations (ARM NEON + AMX)") return { factory -> AccelerateCpuOps(factory) } } -``` +---- This requires splitting the current `nativeMain` expect/actual into separate `macosMain` and `linuxMain` actuals (the `macosMain` source set already exists in `build.gradle.kts`). -**4. Build changes** β€” `build.gradle.kts` +*4. Build changes* β€” `build.gradle.kts` Add cinterop configuration for macosArm64 (and optionally iosArm64/iosSimulatorArm64): -```kotlin +[source,kotlin] +---- macosArm64 { compilations["main"].cinterops { val accelerate by creating { @@ -83,43 +88,43 @@ macosArm64 { } } } -``` +---- Add linker opts for the Accelerate framework to all macOS/iOS binaries. -### Implementation notes +==== Implementation notes -- `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. - Non-accelerated operations fall through to the scalar implementation. -- The `matmul` override should handle 2D FP32 tensors with `cblas_sgemm` and delegate - batched/non-float cases to `super.matmul()`. -- `vDSP_*` functions operate on contiguous `FloatArray` buffers. Tensors backed by - `FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. -- Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer - and only dispatch the contiguous inner loop to Accelerate. -- The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since - Accelerate is available on all Apple platforms. +* `AccelerateCpuOps` should extend `DefaultCpuOps` and override only the operations above. +Non-accelerated operations fall through to the scalar implementation. 
+* The `matmul` override should handle 2D FP32 tensors with `cblas++_++sgemm` and delegate +batched/non-float cases to `super.matmul()`. +* `vDSP++_*++` functions operate on contiguous `FloatArray` buffers. Tensors backed by +`FloatArrayTensorData` can be passed directly; others need a `toFloatArray()` copy. +* Broadcasting logic (e.g., bias add, scalar multiply) should remain in the Kotlin layer +and only dispatch the contiguous inner loop to Accelerate. +* The same approach works for iOS targets (`iosArm64`, `iosSimulatorArm64`) since +Accelerate is available on all Apple platforms. -### Testing +==== Testing -- Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). -- Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output - or add a query method). -- Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend - vs Accelerate backend on the same model. +* Existing `DefaultCpuOps` tests in `commonTest` should pass unchanged (numerical equivalence). +* Add macOS-specific tests verifying Accelerate dispatch actually occurs (e.g., check log output +or add a query method). +* Benchmark comparison: run `llm-performance` native benchmark with the current scalar backend +vs Accelerate backend on the same model. 
-### Expected impact +==== Expected impact Based on JVM BLAS vs scalar measurements and Apple's published Accelerate performance data: -- **matmul**: 10-50x speedup (NEON + AMX vs scalar loop) -- **elementwise**: 4-8x speedup (NEON vectorization) -- **reductions**: 4-8x speedup (NEON vectorization) -- **overall LLM inference**: 5-20x speedup on native macOS CPU backend +* *matmul*: 10-50x speedup (NEON {plus} AMX vs scalar loop) +* *elementwise*: 4-8x speedup (NEON vectorization) +* *reductions*: 4-8x speedup (NEON vectorization) +* *overall LLM inference*: 5-20x speedup on native macOS CPU backend -### Files to create/modify +==== Files to create/modify -``` +.... skainet-backends/skainet-backend-cpu/ β”œβ”€β”€ build.gradle.kts # add cinterop β”œβ”€β”€ src/nativeInterop/cinterop/accelerate.def # NEW @@ -127,12 +132,12 @@ skainet-backends/skainet-backend-cpu/ β”œβ”€β”€ src/macosMain/kotlin/.../PlatformCpuOpsFactory.macos.kt # NEW β”œβ”€β”€ src/linuxMain/kotlin/.../PlatformCpuOpsFactory.linux.kt # NEW (move from nativeMain) └── src/nativeMain/kotlin/.../PlatformCpuOpsFactory.native.kt # REMOVE (split to platform-specific) -``` +.... 
-### References +==== References -- JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` -- JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` -- Apple Accelerate docs: https://developer.apple.com/documentation/accelerate -- CBLAS reference: https://developer.apple.com/documentation/accelerate/blas -- vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp +* JVM SIMD implementation: `src/jvmMain/kotlin/.../DefaultCpuOpsJvm.kt` +* JVM BLAS integration: `src/jvmMain/kotlin/.../JvmBlas.kt` +* Apple Accelerate docs: https://developer.apple.com/documentation/accelerate +* CBLAS reference: https://developer.apple.com/documentation/accelerate/blas +* vDSP reference: https://developer.apple.com/documentation/accelerate/vdsp diff --git a/docs/ops-docs.adoc b/docs/modules/ROOT/pages/explanation/operator-design.adoc similarity index 96% rename from docs/ops-docs.adoc rename to docs/modules/ROOT/pages/explanation/operator-design.adoc index faa5d75c..6ce1d8d4 100644 --- a/docs/ops-docs.adoc +++ b/docs/modules/ROOT/pages/explanation/operator-design.adoc @@ -101,7 +101,7 @@ Your article must be written in AsciiDoc and include the following sections (use - Show how fragments embed: β€’ An API signature block β€’ A status table by backend - β€’ Pointers (xref::) to human-written math/semantics sections + β€’ Pointers (`xref:`) to human-written math/semantics sections - Provide example AsciiDoc fragment: [source,adoc] @@ -126,7 +126,7 @@ Your article must be written in AsciiDoc and include the following sections (use See xref:theory/matmul.adoc#definition[MatMul semantics] and xref:examples/matmul.adoc#examples[Examples]. 
---- -- Demonstrate combining generated and human-written docs via include:: and xref::, with a small folder layout: +- Demonstrate combining generated and human-written docs via `include::` and `xref:`, with a small folder layout: [source,text] ---- docs/ @@ -147,7 +147,7 @@ Your article must be written in AsciiDoc and include the following sections (use β€’ Human-written caveats that reference generated statuses via xref anchors. - Show a synchronization flow as a Mermaid diagram: - [source,mermaid] + [mermaid] ---- flowchart LR A[Operator Interfaces (KMP)] --> B[KSP Processor] @@ -226,7 +226,7 @@ Your article must be written in AsciiDoc and include the following sections (use ---- - Show the KSP-produced JSON excerpt and the corresponding generated AsciiDoc fragment for at least one function (e.g., relu). -- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it’s included via xref:: from the generated fragment. +- Give a minimal human-written math section for MatMul (dimensions, shapes, complexity), and show how it is included via `xref:` from the generated fragment. == 8. Summary and Benefits - Summarize benefits: @@ -246,7 +246,7 @@ APPENDIX (OPTIONAL BUT STRONGLY RECOMMENDED) OUTPUT FORMAT REQUIREMENTS - Write the entire article as **AsciiDoc**. -- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [source,mermaid], [source,plantuml] (PlantUML optional). +- Use code blocks with language tags: [source,kotlin], [source,gradle], [source,json], [source,adoc], [mermaid], [source,plantuml] (PlantUML optional). - Use short paragraphs and bullet lists; avoid filler or marketing language. - Include at least: β€’ One Mermaid diagram (the pipeline). @@ -260,7 +260,7 @@ ACCEPTANCE CHECKLIST (the output must satisfy all) - [ ] Clear definition of β€œreflective documentation” and how it differs from classic docgen. 
- [ ] KSP plan, annotation semantics, and JSON Schema included. - [ ] Example Operator (TensorOps) + generated metadata + generated AsciiDoc fragment. -- [ ] Demonstrated include:: and xref:: usage. +- [ ] Demonstrated `include::` and `xref:` usage. - [ ] Mermaid pipeline diagram present. - [ ] Gradle/Dokka/AsciiDoctorJ integration details with code. - [ ] Summary articulates benefits and risks. \ No newline at end of file diff --git a/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc new file mode 100644 index 00000000..2b74c01c --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/java-25-cpu-backend.adoc @@ -0,0 +1,111 @@ +==== Java 25 Advantages for the JVM CPU Backend + +Java 25 (GA September 2025) delivers significant free performance improvements to the +SKaiNET JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup +features β€” all without requiring code changes. + +===== Compatibility + +The same code, same flags, and same runtime detection work across JDK 21–25: + +* Vector API remains incubator on JDK 25 (JEP 508) β€” identical `jdk.incubator.vector` package. +* Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22{plus}. +* Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. +* Build config (`jvmTarget = JVM++_++21`, `options.release.set(21)`) produces compatible bytecode. + +*No special treatment is needed for JDK ++>++= 21 but ++<++ 25.* + +Required flags remain: + +.... +--enable-preview --add-modules jdk.incubator.vector +.... + +[[jit--c2-improvements-mapped-to-skainet-ops]] +===== JIT / C2 improvements mapped to SKaiNET ops + +These are automatic β€” the JIT produces better native code for existing bytecode. 
+ +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|VPointer refactoring for vector loads/stores |https://bugs.openjdk.org/browse/JDK-8350748[JDK-8350748] |up to 14x |All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` +|SuperWord SIMD enhancement |https://bugs.openjdk.org/browse/JDK-8343685[JDK-8343685] |up to 33x |Same vectorized loops (elementwise, reductions, matmul inner loops) +|`Math.max` / `Math.min` intrinsified for `long` |JDK-8350485 |3–5x |Shape computation, tile clamping in blocked matmul +|=== + +Source files: + +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` +* `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` + +===== Panama FFI improvements + +[cols=",,,",options="header",] +|=== +|Improvement |JDK bug |Speedup |Affected SKaiNET code +|Faster `MemorySegment` allocation |https://bugs.openjdk.org/browse/JDK-8345687[JDK-8345687] |~2x |`MemorySegmentTensorData.kt` (`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` +|`MemorySegment::fill` optimized on AArch64 |https://bugs.openjdk.org/browse/JDK-8354674[JDK-8354674] |~2.5x |Tensor zeroing, blocked matmul result initialization +|=== + +Source files: + +* `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` +* `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` + +===== Object layout and GC + +* *Compact Object Headers* (JEP 519) β€” reduces object header from 12 to 8 bytes. +Meaningful for tensor metadata arrays with millions of small objects. +Opt-in: `-XX:{plus}UseCompactObjectHeaders` +* *Generational Shenandoah* (JEP 521) β€” lower GC pause times for allocation-heavy +workloads (tensor creation, KV cache churn). 
+Opt-in: `-XX:{plus}UseShenandoahGC -XX:ShenandoahGCMode=generational` + +===== Startup and warmup + +* *AOT profiling / caching* (JEP 515) β€” records JIT profile data from a training run +and replays it on subsequent launches. Reduces warmup by 15–25%. +Useful for CLI apps like kLLaMA where first-token latency matters. + +Usage: + +.... +# Training run (records profile) +java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" + +# Production run (replays profile) +java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" +.... + +===== Recommended JVM flags for Java 25 + +Required (same as JDK 21–24): + +.... +--enable-preview +--add-modules jdk.incubator.vector +.... + +Optional β€” enable for maximum benefit on JDK 25: + +.... +-XX:+UseCompactObjectHeaders +-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational +-XX:AOTCache=app.aot # after training run +.... + +===== Summary + +[cols=",,",options="header",] +|=== +|Feature |Benefit |Component +|VPointer refactoring (C2) |Up to 14x faster vector loads/stores |`JvmVectorKernels`, `JvmQuantizedVectorKernels` +|SuperWord SIMD (C2) |Up to 33x faster auto-vectorized loops |Same vector kernel files +|`Math.max/min` intrinsic |3–5x faster long comparisons |Shape computation, tile clamping +|Faster segment allocation |~2x allocation throughput |`MemorySegmentTensorDataFactory`, `PagedKvCache` +|`MemorySegment::fill` (AArch64) |~2.5x faster bulk zeroing |Tensor init, matmul result buffers +|Compact Object Headers |~30% smaller object headers |All tensor metadata +|Generational Shenandoah |Lower GC pauses |Allocation-heavy inference +|AOT profiling |15–25% faster warmup |CLI apps (kLLaMA) +|=== diff --git a/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc new file mode 100644 index 00000000..167aac22 --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc @@ -0,0 +1,110 @@ +==== JVM CPU Backend Performance Benchmarks (JMH) + +This 
page explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence for performance targets.
+
+===== What’s included
+
+* Elementwise: FP32 `add` on 1,000,000 elements
+* Reductions: FP32 `sum` and `mean` on 1,000,000 elements
+* Matmul: FP32 square `matmul` with sizes 256, 512, and 1024
+
+Benchmarks are implemented in module:
+
+* `:skainet-backends:benchmarks:jvm-cpu-jmh`
+
+Source files:
+
+* `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt`
+* `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt`
+* `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt`
+
+===== Prerequisites
+
+* JDK 21{plus} (JDK 22 toolchain configured by Gradle)
+* Gradle will pass required JVM flags:
+** `--enable-preview`
+** `--add-modules jdk.incubator.vector`
+
+For Java 25-specific performance advantages, see xref:java-25-cpu-backend.adoc[Java 25 CPU Backend].
+
+===== Feature flags
+
+You can toggle acceleration paths at runtime using system properties or environment variables:
+
+* Vector acceleration:
+** `-Dskainet.cpu.vector.enabled=true++|++false`
+** or `SKAINET++_++CPU++_++VECTOR++_++ENABLED=true++|++false`
+* BLAS via Panama (matmul heuristic for larger sizes):
+** `-Dskainet.cpu.blas.enabled=true++|++false`
+** or `SKAINET++_++CPU++_++BLAS++_++ENABLED=true++|++false`
+
+Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args.
+
+===== How to run all benchmarks
+
+From repository root:
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh
+....
+
+This will build and execute all JMH benchmarks with the default parameters defined in sources.
+
+===== Run specific benchmarks
+
+* Elementwise add (both vector on/off):
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \
+  -Pjmh.include=ElementwiseAdd1MBench
+....
+
+* Reductions (vector on/off):
+
+....
+./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \
+  -Pjmh.include=Reductions1MBench
+....
+ +* Matmul, all sizes, with vector on and BLAS on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true +.... + +* Matmul at 512 only, comparing BLAS on/off with vector on: + +.... +./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ + -Pjmh.include=MatmulBench \ + -Pjmh.param.size=512 \ + -Pjmh.param.vectorEnabled=true \ + -Pjmh.param.blasEnabled=true,false +.... + +Notes: + +* You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). +* JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. + +===== Recording environment details + +Include at minimum: + +* CPU model, cores/threads, base/boost clock +* RAM size and speed +* OS version +* JDK version and vendor +* Gradle version +* JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) +* SKaiNET flags used (vector, BLAS) + +===== Performance targets (to be validated on your hardware) + +* β‰₯ 4Γ— speedup on FP32 `matmul` 512Γ—512 vs baseline scalar +* β‰₯ 3Γ— speedup on FP32 `add` with 1M elements vs baseline scalar + +Use the above commands to produce β€œvector=false/blas=false” baselines vs β€œvector=true++[++/blas=true++]++” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. 
diff --git a/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc new file mode 100644 index 00000000..102aa5ac --- /dev/null +++ b/docs/modules/ROOT/pages/explanation/skainet-for-ai.adoc @@ -0,0 +1,143 @@ +[[skainet-core-technology-tensor--data-guide]] +== SKaiNET Core Technology: Tensor & Data Guide + +This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. + +[[1-fundamental-architecture-tensor-composition]] +=== 1. Fundamental Architecture: Tensor Composition + +Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a *compositional architecture*. A `Tensor++<++T, V++>++` is composed of two primary components: + +[arabic] +. *`TensorData++<++T, V++>++`*: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. +. *`TensorOps`*: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). + +This separation allows for high flexibility, such as switching execution backends without changing the data representation. + +[source,kotlin] +---- +interface Tensor { + val data: TensorData + val ops: TensorOps + val dtype: KClass + val shape: Shape +} +---- + +[[2-type-safe-tensor-creation-dsl]] +=== 2. Type-Safe Tensor Creation (DSL) + +SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). + +==== Creation with `ExecutionContext` + +Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. 
+ +[source,kotlin] +---- +// Basic creation +val zeros = ctx.zeros(Shape(2, 3), FP32::class) +val ones = ctx.ones(Shape(1, 10), Int32::class) +val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) +---- + +==== Expressive Tensor DSL + +For more complex initializations, use the `tensor` DSL: + +[source,kotlin] +---- +val myTensor = tensor(ctx, FP32::class) { + shape(2, 2) { + from(1.0f, 2.0f, 3.0f, 4.0f) + } +} + +val randomTensor = tensor(ctx, FP32::class) { + shape(10, 10) { + randn(mean = 0f, std = 1f) + } +} + +val customInit = tensor(ctx, Int32::class) { + shape(5, 5) { + init { indices -> indices[0] + indices[1] } + } +} +---- + +[[3-slicing-dsl-api]] +=== 3. Slicing DSL API + +SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. + +==== `sliceView` vs `sliceCopy` + +* *`sliceView`*: Creates a `TensorView`, which is a window into the original data (no data copying). +* *`sliceCopy`*: Creates a new `Tensor` with a copy of the sliced data. + +==== Slicing DSL Syntax + +The `SegmentBuilder` provides several ways to define slices for each dimension: + +* `range(start, end)`: A range of indices. +* `at(index)`: A single index (reduces rank). +* `all()`: All elements in that dimension (equivalent to `:` in NumPy). +* `step(start, end, step)`: Strided access. +* `{plus}all()`: Short-hand for `all()`. + +[source,kotlin] +---- +val source = ctx.ones(Shape(10, 20, 30), FP32::class) + +// Slicing: [0:5, 10, :] +val view = source.sliceView { + segment { range(0, 5) } // Dim 0 + segment { at(10) } // Dim 1 + segment { all() } // Dim 2 +} +---- + +[[4-core-operations-tensorops]] +=== 4. Core Operations (`TensorOps`) + +All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: + +* *Element-wise Ops*: `add`, `subtract`, `multiply`, `divide` (and scalar versions). +* *Linear Algebra*: `matmul`, `transpose`. 
+* *Neural Network Ops*: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. +* *Reductions*: `sum`, `mean`, `variance`. +* *Shape Ops*: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. + +==== Operator Overloading + +When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: + +[source,kotlin] +---- +val c = a + b // Calls ops.add(a, b) +val d = a * 10 // Calls ops.mulScalar(a, 10) +---- + +[[5-summary-table-skainet-vs-numpy]] +=== 5. Summary Table: SKaiNET vs NumPy + +[cols="<,<,<",options="header",] +|=== +|Feature |NumPy |SKaiNET +|*Primary Type* |`ndarray` |`Tensor++<++T, V++>++` +|*Creation* |`np.array(++[++1, 2, 3++]++)` |`tensor(ctx, FP32::class) ++{++ shape(3) ++{++ from(1f, 2f, 3f) } }` +|*Zeros* |`np.zeros((2, 2))` |`ctx.zeros(Shape(2, 2), FP32::class)` +|*Slicing* |`a++[++0:5, :++]++` |`a.sliceView ++{++ segment ++{++ range(0, 5) }; segment ++{++ all() } }` +|*Matmul* |`a @ b` or `np.matmul(a, b)` |`ctx.ops.matmul(a, b)` +|*Reshape* |`a.reshape(new++_++shape)` |`ctx.ops.reshape(a, Shape(new++_++shape))` +|=== + +[[6-best-practices-for-ai-integration]] +=== 6. Best Practices for AI Integration + +[arabic] +. *Context Awareness*: Always pass the `ExecutionContext` to functions that create or manipulate tensors. +. *Type Safety*: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. +. *Views over Copies*: Use `sliceView` whenever possible to minimize memory overhead and improve performance. +. *Backend Agnostic*: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend. 
diff --git a/docs/theory/index.adoc b/docs/modules/ROOT/pages/explanation/theory/index.adoc similarity index 80% rename from docs/theory/index.adoc rename to docs/modules/ROOT/pages/explanation/theory/index.adoc index e82bd082..80917802 100644 --- a/docs/theory/index.adoc +++ b/docs/modules/ROOT/pages/explanation/theory/index.adoc @@ -5,10 +5,6 @@ This section contains mathematical definitions and theoretical foundations for S [#operator-theory] == Operator Theory -=== Architecture - -include::composite-ops.adoc[leveloffset=+2] - === Linear Algebra Operations include::matmul.adoc[leveloffset=+2] @@ -36,5 +32,5 @@ include::matmul.adoc[leveloffset=+2] [#cross-references] == Cross-References -* xref:../examples/index.adoc[Usage Examples] -* xref:../modules/operators/_generated_/index.adoc[Generated API Reference] \ No newline at end of file +* xref:explanation/examples/index.adoc[Usage Examples] +// Operator reference lands in a later commit of the Antora migration. \ No newline at end of file diff --git a/docs/theory/matmul.adoc b/docs/modules/ROOT/pages/explanation/theory/matmul.adoc similarity index 100% rename from docs/theory/matmul.adoc rename to docs/modules/ROOT/pages/explanation/theory/matmul.adoc diff --git a/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc new file mode 100644 index 00000000..7ef1165c --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/arduino-c-codegen.adoc @@ -0,0 +1,95 @@ +== Arduino C Code Generation + +SKaiNET provides a specialized compiler backend for exporting trained neural networks to highly optimized, standalone C99 code suitable for microcontrollers like Arduino. + +=== Overview + +The Arduino C code generation process transforms a high-level Kotlin model into a memory-efficient C implementation. It prioritizes static memory allocation, minimal overhead, and numerical consistency with the original model. 
+ +==== Codegen Pipeline + +[mermaid] +---- +graph TD + A[Kotlin Model] --> B[Recording Pass] + B --> C[Execution Tape] + C --> D[Compute Graph] + D --> E[Graph Validation] + E --> F[Memory Layout Calculation] + F --> G[C Code Emission] + G --> H[Arduino Library Packaging] + H --> I[Generated .h/.c files] +---- + +=== Technical Deep Dive + +[[1-tape-based-tracing]] +==== 1. Tape-based Tracing + +Instead of static analysis of the Kotlin code, SKaiNET uses a dynamic tracing mechanism. When you call `exportToArduinoLibrary`, the framework executes a single forward pass of your model using a specialized `RecordingContext`. + +* Every operation (Dense, ReLU, etc.) is recorded onto an *Execution Tape*. +* This approach handles Kotlin's language features (loops, conditionals) naturally, as it only records the actual operations that were executed. + +[[2-compute-graph-construction]] +==== 2. Compute Graph Construction + +The execution tape is converted into a directed acyclic graph (DAG) called `ComputeGraph`. + +* Nodes represent operations (Ops). +* Edges represent data flow (Tensors). +* During this phase, the compiler performs *Shape Inference* to ensure every tensor has a fixed, known size. + +[[3-static-memory-management]] +==== 3. Static Memory Management + +Microcontrollers typically have very limited RAM and lack robust heap management. SKaiNET uses a *Ping-Pong Buffer Strategy* to eliminate dynamic memory allocation (`malloc`/`free`) during inference. + +===== Ping-Pong Buffer Strategy + +The compiler calculates the maximum size required for any intermediate tensor in the graph and allocates exactly two static buffers of that size. 
+ +[mermaid] +---- +sequenceDiagram + participant I as Input + participant B1 as Buffer A + participant B2 as Buffer B + participant O as Output + + I->>B1: Layer 1 (Input -> A) + B1->>B2: Layer 2 (A -> B) + B2->>B1: Layer 3 (B -> A) + B1->>O: Layer 4 (A -> Output) +---- + +* *Buffer Reuse*: Instead of allocating space for every layer's output, buffers are reused. +* *Direct Output Optimization*: The first layer reads from the input pointer, and the last layer writes directly to the output pointer, avoiding unnecessary copies. + +[[4-code-generation-emission]] +==== 4. Code Generation (Emission) + +The `CCodeGenerator` emits C99-compatible code using templates. + +* *Weights & Biases*: Extracted from the trained Kotlin model and serialized as `static const float` arrays. This places them in Flash memory (PROGMEM) on many microcontrollers, saving precious RAM. +* *Kernel Implementation*: Operations like `Dense` (Linear) are implemented as optimized nested loops. +* *Header Generation*: Produces a clean API for the user: ++ +[source,c] +---- +int model_inference(const float* input, float* output); +---- + +[[5-validation]] +==== 5. Validation + +The generator performs post-generation validation: + +* *Static Allocation Check*: Ensures no dynamic allocation is present in the generated source. +* *Buffer Alternation Check*: Verifies that the ping-pong strategy is correctly implemented without data races or overwrites. + +=== Performance and Constraints + +* *Floating Point*: Currently optimized for `FP32`. +* *Supported Ops*: `Dense`, `ReLU`, `Sigmoid`, `Tanh`, `Add`, `MatMul`. +* *Memory*: Total memory consumption is `TotalWeights {plus} 2 ++*++ MaxIntermediateTensor`. 
diff --git a/docs/modules/ROOT/pages/how-to/build.adoc b/docs/modules/ROOT/pages/how-to/build.adoc new file mode 100644 index 00000000..c3a6b6d6 --- /dev/null +++ b/docs/modules/ROOT/pages/how-to/build.adoc @@ -0,0 +1,87 @@ +== Build Help + +=== Dokka API Documentation + +SKaiNET uses https://github.com/Kotlin/dokka[Dokka 2.1.0] to generate API reference documentation across all public library modules. A shared convention plugin (`sk.ainet.dokka`) standardises the configuration. + +==== Generating docs locally + +*Single module:* + +[source,bash] +---- +./gradlew :skainet-lang:skainet-lang-core:dokkaGeneratePublicationHtml +---- + +Output: `skainet-lang/skainet-lang-core/build/dokka/html/` + +*Aggregated (all modules):* + +[source,bash] +---- +./gradlew dokkaGenerate +---- + +Output: `build/dokka/html/index.html` + +==== Convention plugin details + +The `sk.ainet.dokka` precompiled script plugin (`build-logic/convention/src/main/kotlin/sk.ainet.dokka.gradle.kts`) applies `org.jetbrains.dokka` and configures: + +* *moduleName* from `project.name` +* *moduleVersion* from the `VERSION++_++NAME` Gradle property +* *Documented visibilities:* public only +* *Suppressed generated files:* KSP-generated code is excluded +* *Suppressed native source sets:* `iosArm64Main`, `iosSimulatorArm64Main`, `macosArm64Main`, `linuxX64Main`, `linuxArm64Main` are suppressed because Dokka 2.x cannot translate native cinterop symbols +* *Source links* pointing to the GitHub repository + +==== Modules with Dokka enabled + +The plugin is applied to 21 library modules: + +[cols=",",options="header",] +|=== +|Group |Modules +|skainet-lang |`skainet-lang-core`, `skainet-lang-models`, `skainet-lang-ksp-annotations`, `skainet-lang-dag` +|skainet-compile |`skainet-compile-core`, `skainet-compile-dag`, `skainet-compile-json`, `skainet-compile-hlo`, `skainet-compile-c` +|skainet-backends |`skainet-backend-cpu` +|skainet-data |`skainet-data-api`, `skainet-data-transform`, `skainet-data-simple`, 
`skainet-data-media` +|skainet-io |`skainet-io-core`, `skainet-io-gguf`, `skainet-io-image`, `skainet-io-onnx`, `skainet-io-safetensors` +|Other |`skainet-pipeline`, `skainet-model-yolo` +|=== + +*Excluded:* `skainet-bom` (no source), `skainet-apps/++*++`, `skainet-test/++*++`, benchmarks, and `skainet-lang-ksp-processor` (internal). + +==== Root-level aggregation + +The root `build.gradle.kts` applies the Dokka plugin directly (not `apply false`) and declares `dokka(project(...))` dependencies for all 21 modules. Running `./gradlew dokkaGenerate` at the root produces a unified API reference that includes every module under a single `SKaiNET` namespace. The root `README.md` is included as the landing page. + +==== KSP interaction + +`skainet-lang-core` and `skainet-lang-dag` use KSP to generate source code. Their build files include: + +[source,kotlin] +---- +tasks.matching { it.name.startsWith("dokka") }.configureEach { + dependsOn("kspCommonMainKotlinMetadata") +} +---- + +This ensures KSP-generated sources are available before Dokka runs. + +==== GitHub Pages deployment + +The workflow `.github/workflows/dokka-pages.yml` runs on push to `main` (and manually via `workflow++_++dispatch`). It: + +[arabic] +. Checks out the repo +. Sets up JDK 25 +. Runs `./gradlew dokkaGenerate` +. Uploads the `build/dokka/html` directory as a Pages artifact +. Deploys to GitHub Pages using `actions/deploy-pages@v4` + +*Prerequisite:* The repository must have Pages configured to deploy from GitHub Actions (Settings ++>++ Pages ++>++ Source: "GitHub Actions"). + +==== Operator docs (unchanged) + +The existing operator documentation pipeline (`./gradlew generateDocs`) is unrelated to Dokka and continues to work as before. 
diff --git a/docs/io-readers-guide.md b/docs/modules/ROOT/pages/how-to/io-readers.adoc similarity index 88% rename from docs/io-readers-guide.md rename to docs/modules/ROOT/pages/how-to/io-readers.adoc index d431a7c3..1f4b18da 100644 --- a/docs/io-readers-guide.md +++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc @@ -1,48 +1,54 @@ -# SKaiNET I/O Readers Guide +== SKaiNET I/O Readers Guide This guide demonstrates how to use SKaiNET's GGUF and ONNX readers in your Kotlin Multiplatform projects. -## Overview +=== Overview SKaiNET provides two main I/O modules for reading AI model formats: -- **skainet-io-gguf**: For reading GGUF (GPT-Generated Unified Format) files -- **skainet-io-onnx**: For reading ONNX (Open Neural Network Exchange) files + +* *skainet-io-gguf*: For reading GGUF (GPT-Generated Unified Format) files +* *skainet-io-onnx*: For reading ONNX (Open Neural Network Exchange) files Both modules are built on Kotlin Multiplatform and support JVM, Android, iOS, JS, WASM, and Native platforms. -## Dependencies +=== Dependencies Add the following dependencies to your `build.gradle.kts`: -### For GGUF Support +==== For GGUF Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-gguf:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") } -``` +---- -### For ONNX Support +==== For ONNX Support -```kotlin +[source,kotlin] +---- dependencies { implementation("sk.ainet.core:skainet-io-onnx:0.5.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") implementation("pro.streem.pbandk:pbandk-runtime:0.16.0") } -``` +---- -## GGUF Reader Usage +=== GGUF Reader Usage -> **Recommended:** For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. -> The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting -> files over 100 GB without heap-loading the entire file. It also supports quantized types -> (Q4_K, Q8_0, etc.) via `StreamingGgufParametersLoader`. 
See the streaming examples below. +____ +*Recommended:* For large model files, use `StreamingGGUFReader` instead of `GGUFReader`. +The streaming reader parses only metadata (~1 MB) and loads tensors on-demand, supporting +files over 100 GB without heap-loading the entire file. It also supports quantized types +(Q4++_++K, Q8++_++0, etc.) via `StreamingGgufParametersLoader`. See the streaming examples below. +____ -### Streaming GGUF Reading (Recommended) +==== Streaming GGUF Reading (Recommended) -```kotlin +[source,kotlin] +---- import sk.ainet.io.JvmRandomAccessSource import sk.ainet.io.gguf.StreamingGGUFReader @@ -60,16 +66,19 @@ fun readLargeModel(filePath: String) { println("Encoding: ${storage.encoding.name}, Physical: ${storage.physicalBytes} bytes") } } -``` +---- -### Legacy GGUF Reading +==== Legacy GGUF Reading -> **Note:** The legacy `GGUFReader` loads the entire file into memory and only supports -> F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. +____ +*Note:* The legacy `GGUFReader` loads the entire file into memory and only supports +F32/I32 tensors. Prefer `StreamingGGUFReader` for new code. 
+____ -### Basic GGUF Reading +==== Basic GGUF Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import kotlinx.io.buffered @@ -114,11 +123,12 @@ suspend fun readGGUFModel(filePath: String) { } } } -``` +---- -### Working with Tensor Data +==== Working with Tensor Data -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader import sk.ainet.io.gguf.GGMLQuantizationType @@ -155,11 +165,12 @@ fun processTensorData(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading for Large Models +==== Lazy Loading for Large Models -```kotlin +[source,kotlin] +---- import sk.ainet.io.gguf.GGUFReader fun readGGUFMetadataOnly(filePath: String) { @@ -184,13 +195,14 @@ fun readGGUFMetadataOnly(filePath: String) { } } } -``` +---- -## ONNX Reader Usage +=== ONNX Reader Usage -### Basic ONNX Reading +==== Basic ONNX Reading -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import kotlinx.io.asSource import sk.ainet.io.onnx.OnnxLoader @@ -229,11 +241,12 @@ suspend fun readONNXModel(filePath: String) { println(" Outputs: ${graph.output.size}") } } -``` +---- -### Working with ONNX Graph Structure +==== Working with ONNX Graph Structure -```kotlin +[source,kotlin] +---- import onnx.ModelProto import onnx.NodeProto import onnx.TensorProto @@ -286,11 +299,12 @@ fun getAttributeValue(attr: onnx.AttributeProto): String { fun getTensorShapeString(tensor: TensorProto): String { return tensor.dims.joinToString("x") { it.toString() } } -``` +---- -### Custom ONNX Loader with Error Handling +==== Custom ONNX Loader with Error Handling -```kotlin +[source,kotlin] +---- import kotlinx.io.Source import sk.ainet.io.onnx.OnnxLoader import sk.ainet.io.onnx.OnnxLoadedModel @@ -352,13 +366,14 @@ suspend fun safeLoadOnnx(filePath: String) { println("Failed to load ONNX model: ${error.message}") } } -``` +---- -## Platform-Specific Considerations +=== Platform-Specific Considerations -### JVM Platform +==== JVM Platform -```kotlin 
+[source,kotlin] +---- // JVM-specific file reading import java.io.File import java.nio.file.Path @@ -366,11 +381,12 @@ import java.nio.file.Path fun readFromFile(path: Path): Source { return path.toFile().inputStream().asSource().buffered() } -``` +---- -### Android Platform +==== Android Platform -```kotlin +[source,kotlin] +---- // Android-specific asset reading import android.content.Context import android.content.res.AssetManager @@ -378,11 +394,12 @@ import android.content.res.AssetManager fun readFromAssets(context: Context, fileName: String): Source { return context.assets.open(fileName).asSource().buffered() } -``` +---- -### iOS/Native Platform +==== iOS/Native Platform -```kotlin +[source,kotlin] +---- // Native platform file reading import kotlinx.io.files.Path import kotlinx.io.files.SystemFileSystem @@ -391,13 +408,14 @@ fun readFromNativePath(pathString: String): Source { val path = Path(pathString) return SystemFileSystem.source(path).buffered() } -``` +---- -## Performance Tips +=== Performance Tips -### Memory Management +==== Memory Management -```kotlin +[source,kotlin] +---- // For large models, consider streaming or chunked processing fun processLargeModel(reader: GGUFReader) { // Process tensors one at a time to manage memory @@ -411,11 +429,12 @@ fun processLargeModel(reader: GGUFReader) { } } } -``` +---- -### Lazy Loading Strategy +==== Lazy Loading Strategy -```kotlin +[source,kotlin] +---- class ModelManager { private var reader: GGUFReader? = null private val tensorCache = mutableMapOf>() @@ -431,11 +450,12 @@ class ModelManager { } } } -``` +---- -## Error Handling Best Practices +=== Error Handling Best Practices -```kotlin +[source,kotlin] +---- sealed class ModelLoadResult { data class Success(val model: T) : ModelLoadResult() data class Error(val message: String, val cause: Throwable? 
= null) : ModelLoadResult() @@ -459,13 +479,14 @@ suspend fun loadModelSafely(filePath: String): ModelLoadResult { ModelLoadResult.Error("Failed to load model: ${e.message}", e) } } -``` +---- -## Integration Examples +=== Integration Examples -### Using with Coroutines +==== Using with Coroutines -```kotlin +[source,kotlin] +---- import kotlinx.coroutines.* class AsyncModelLoader { @@ -495,6 +516,6 @@ class AsyncModelLoader { } data class ProcessedTensor(val name: String, val size: Int) -``` +---- -This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. \ No newline at end of file +This guide provides comprehensive examples for using SKaiNET's I/O readers in your projects. The readers are designed to be efficient, multiplatform-compatible, and easy to integrate into existing Kotlin applications. diff --git a/docs/java-cli-app.md b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc similarity index 85% rename from docs/java-cli-app.md rename to docs/modules/ROOT/pages/how-to/java-cli-app.adoc index c2288a5d..a233942d 100644 --- a/docs/java-cli-app.md +++ b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc @@ -1,22 +1,23 @@ -# Building a Java CLI App with KLlama +== Building a Java CLI App with KLlama -This guide walks you through creating a standalone Java 21+ command-line application that loads a LLaMA model and generates text using the KLlama library. +This guide walks you through creating a standalone Java 21{plus} command-line application that loads a LLaMA model and generates text using the KLlama library. 
-## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** -- A GGUF model file (e.g., [TinyLlama-1.1B-Chat GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)) +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* +* A GGUF model file (e.g., https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF[TinyLlama-1.1B-Chat GGUF]) ---- +''''' -## Project Setup +=== Project Setup -### Maven +==== Maven Create a `pom.xml`: -```xml +[source,xml] +---- System.out.print(token)); System.out.println(); } -``` +---- ---- +''''' -## Async Generation +=== Async Generation Use `generateAsync` to run generation on a virtual thread and get a `CompletableFuture`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { @@ -262,20 +269,21 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { String result = future.join(); System.out.println(result); } -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [Java LLM Inference Guide](java-llm-inference.md) β€” BERT embeddings, agent/tool-calling, and more. -- [Java Getting Started](java-getting-started.md) β€” tensor operations, full Maven/Gradle setup. -- [KLlama Library](../skainet-apps/skainet-kllama/README.md) β€” custom backends and Kotlin embedding. +* link:java-llm-inference.md[Java LLM Inference Guide] β€” BERT embeddings, agent/tool-calling, and more. +* link:java-getting-started.md[Java Getting Started] β€” tensor operations, full Maven/Gradle setup. 
+* link:../skainet-apps/skainet-kllama/README.md[KLlama Library] β€” custom backends and Kotlin embedding. diff --git a/docs/java-llm-inference.md b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc similarity index 78% rename from docs/java-llm-inference.md rename to docs/modules/ROOT/pages/how-to/java-llm-inference.adoc index feb94244..567b9aa1 100644 --- a/docs/java-llm-inference.md +++ b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc @@ -1,15 +1,16 @@ -# Java LLM Inference Guide +== Java LLM Inference Guide This guide covers loading and running large language models (LLaMA, BERT) from Java using SKaiNET's blocking, streaming, and async APIs. -## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -47,19 +48,20 @@ This guide covers loading and running large language models (LLaMA, BERT) from J skainet-backend-cpu-jvm -``` +---- ---- +''''' -## LLaMA Inference +=== LLaMA Inference All LLaMA Java classes live in `sk.ainet.apps.kllama.java`. -### Loading a GGUF Model +==== Loading a GGUF Model The simplest way to get started is to load a GGUF file. `KLlamaJava.loadGGUF()` handles context creation, weight loading, quantization dispatch, and tokenizer setup behind the scenes. -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.java.GenerationConfig; @@ -73,50 +75,54 @@ public class LlamaExample { } } } -``` +---- `KLlamaSession` implements `AutoCloseable`, so `try-with-resources` properly releases the off-heap memory arenas when you are done. 
-### Loading SafeTensors (HuggingFace Format) +==== Loading SafeTensors (HuggingFace Format) If you have a HuggingFace model directory containing `model.safetensors`, `config.json`, and `tokenizer.json`: -```java +[source,java] +---- try (KLlamaSession session = KLlamaJava.loadSafeTensors(Path.of("./my-llama-model/"))) { String response = session.generate("Once upon a time"); System.out.println(response); } -``` +---- The directory must contain: -- `model.safetensors` -- the model weights -- `config.json` -- model architecture config (hidden size, layers, heads, etc.) -- `tokenizer.json` -- HuggingFace tokenizer definition ---- +* `model.safetensors` -- the model weights +* `config.json` -- model architecture config (hidden size, layers, heads, etc.) +* `tokenizer.json` -- HuggingFace tokenizer definition -## GenerationConfig +''''' + +=== GenerationConfig Control generation parameters with the builder pattern: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(256) // maximum tokens to generate (default: 256) .temperature(0.7f) // sampling temperature (default: 0.8) .build(); String response = session.generate("Explain quantum computing", config); -``` +---- Use `GenerationConfig.defaults()` for the default configuration (256 max tokens, 0.8 temperature). ---- +''''' -## Streaming Generation +=== Streaming Generation -Pass a `Consumer` to receive each token as it is generated. This is useful for displaying output in real time: +Pass a `Consumer++<++String++>++` to receive each token as it is generated. This is useful for displaying output in real time: -```java +[source,java] +---- GenerationConfig config = GenerationConfig.builder() .maxTokens(512) .temperature(0.9f) @@ -129,17 +135,18 @@ String fullResponse = session.generate( ); System.out.println(); // newline after streaming -``` +---- -The `generate` overload with a `Consumer` still returns the complete generated text as its return value. 
+The `generate` overload with a `Consumer++<++String++>++` still returns the complete generated text as its return value. ---- +''''' -## Async Generation +=== Async Generation -`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture`: +`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture++<++String++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = session.generateAsync( @@ -150,27 +157,29 @@ CompletableFuture future = session.generateAsync( // Do other work while generation runs... String result = future.join(); // block when you need the result System.out.println(result); -``` +---- You can also compose futures: -```java +[source,java] +---- session.generateAsync("Translate to French: Hello world") .thenAccept(translation -> System.out.println("Translation: " + translation)) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## BERT Encoding and Similarity +=== BERT Encoding and Similarity All BERT Java classes live in `sk.ainet.apps.bert.java`. 
-### Loading a BERT Model +==== Loading a BERT Model Load a BERT model from a HuggingFace directory containing `model.safetensors` and `vocab.txt`: -```java +[source,java] +---- import sk.ainet.apps.bert.java.KBertJava; import sk.ainet.apps.bert.java.KBertSession; import java.nio.file.Path; @@ -180,18 +189,20 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ float[] embedding = bert.encode("SKaiNET is a tensor framework"); System.out.println("Embedding dimension: " + embedding.length); } -``` +---- The directory must contain: -- `model.safetensors` -- BERT model weights -- `vocab.txt` -- WordPiece vocabulary -- `config.json` (optional) -- model config; defaults are used if absent -### Similarity Scoring +* `model.safetensors` -- BERT model weights +* `vocab.txt` -- WordPiece vocabulary +* `config.json` (optional) -- model config; defaults are used if absent + +==== Similarity Scoring Compute cosine similarity between two texts directly: -```java +[source,java] +---- try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/"))) { float score = bert.similarity( "The cat sat on the mat", @@ -206,21 +217,22 @@ try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/ ); System.out.printf("Unrelated: %.4f%n", low); // e.g. 0.1247 } -``` +---- -The returned value is cosine similarity in the range [-1, 1]. +The returned value is cosine similarity in the range ++[++-1, 1++]++. ---- +''''' -## Agent Loop and Tool Calling +=== Agent Loop and Tool Calling All agent/tool classes live in `sk.ainet.apps.kllama.chat.java`. The `JavaAgentLoop` lets the LLM call tools in a loop until it produces a final answer. You define tools by implementing the `JavaTool` interface. 
-### Defining a Tool +==== Defining a Tool -```java +[source,java] +---- import sk.ainet.apps.kllama.chat.java.JavaTool; import sk.ainet.apps.kllama.chat.ToolDefinition; import java.util.Map; @@ -255,11 +267,12 @@ public class CalculatorTool implements JavaTool { return 0.0; } } -``` +---- -### Building and Using the Agent +==== Building and Using the Agent -```java +[source,java] +---- import sk.ainet.apps.kllama.java.KLlamaJava; import sk.ainet.apps.kllama.java.KLlamaSession; import sk.ainet.apps.kllama.chat.java.JavaAgentLoop; @@ -285,24 +298,26 @@ try (KLlamaSession session = KLlamaJava.loadGGUF(Path.of("model.gguf"))) { // Reset conversation history (keeps system prompt) agent.reset(); } -``` +---- -### Streaming Agent Responses +==== Streaming Agent Responses -```java +[source,java] +---- String answer = agent.chat( "What is the square root of 144?", token -> System.out.print(token) ); -``` +---- ---- +''''' -## Resource Management +=== Resource Management Both `KLlamaSession` and `KBertSession` implement `AutoCloseable`. Always use `try-with-resources` to ensure off-heap memory arenas and other native resources are released promptly: -```java +[source,java] +---- // Single session try (KLlamaSession session = KLlamaJava.loadGGUF(path)) { session.generate("Hello"); @@ -315,23 +330,25 @@ try (KLlamaSession llama = KLlamaJava.loadGGUF(llamaPath); String text = llama.generate("Write a summary of quantum mechanics"); float[] embedding = bert.encode(text); } -``` +---- Failing to close sessions will leak off-heap memory allocated via `java.lang.foreign.Arena`. 
---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|----------------------------------------|---------------------------------------------| -| `sk.ainet.apps.kllama.java` | `KLlamaJava`, `KLlamaSession`, `GenerationConfig` | -| `sk.ainet.apps.bert.java` | `KBertJava`, `KBertSession` | -| `sk.ainet.apps.kllama.chat.java` | `JavaAgentLoop`, `JavaTool` | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.apps.kllama.java` |`KLlamaJava`, `KLlamaSession`, `GenerationConfig` +|`sk.ainet.apps.bert.java` |`KBertJava`, `KBertSession` +|`sk.ainet.apps.kllama.chat.java` |`JavaAgentLoop`, `JavaTool` +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [Model Training Guide](java-model-training.md) -- build and train neural networks from Java. +* link:java-getting-started.md[Java Getting Started] -- tensor operations, project setup, and dependency management. +* link:java-model-training.md[Model Training Guide] -- build and train neural networks from Java. diff --git a/docs/java-model-training.md b/docs/modules/ROOT/pages/how-to/java-model-training.adoc similarity index 80% rename from docs/java-model-training.md rename to docs/modules/ROOT/pages/how-to/java-model-training.adoc index 92e3e9cc..2abf7d17 100644 --- a/docs/java-model-training.md +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -1,15 +1,16 @@ -# Java Model Training Guide +== Java Model Training Guide This guide covers building neural networks, defining loss functions and optimizers, loading datasets, and running training loops -- all from plain Java. 
-## Prerequisites +=== Prerequisites -- JDK 21+ with `--enable-preview --add-modules jdk.incubator.vector` -- See [Java Getting Started](java-getting-started.md) for project setup +* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* See link:java-getting-started.md[Java Getting Started] for project setup -### Maven Dependencies +==== Maven Dependencies -```xml +[source,xml] +---- @@ -41,15 +42,16 @@ This guide covers building neural networks, defining loss functions and optimize skainet-data-simple-jvm -``` +---- ---- +''''' -## Building a Model with SequentialModelBuilder +=== Building a Model with SequentialModelBuilder `SequentialModelBuilder` provides a fluent API for stacking dense layers and activations. It lives in `sk.ainet.java`. -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.java.SequentialModelBuilder; import sk.ainet.lang.nn.Module; @@ -63,39 +65,43 @@ Module model = new SequentialModelBuilder(ctx) .relu() // ReLU activation .dense(10) // fully connected: 128 -> 10 (digit classes) .build(); -``` - -### Available Layers and Activations - -| Method | Description | -|-------------------------|------------------------------------------| -| `.input(size)` | Set the input dimension (must be first) | -| `.dense(outputSize)` | Fully connected (linear) layer | -| `.relu()` | ReLU activation: max(0, x) | -| `.sigmoid()` | Sigmoid activation | -| `.silu()` | SiLU / Swish activation: x * sigmoid(x) | -| `.gelu()` | GELU activation | -| `.softmax(dim)` | Softmax along a dimension (default: -1) | -| `.flatten(start, end)` | Flatten dimensions | +---- + +==== Available Layers and Activations + +[cols=",",options="header",] +|=== +|Method |Description +|`.input(size)` |Set the input dimension (must be first) +|`.dense(outputSize)` |Fully connected (linear) layer +|`.relu()` |ReLU activation: max(0, x) +|`.sigmoid()` |Sigmoid activation +|`.silu()` |SiLU / Swish activation: x ++*++ sigmoid(x) +|`.gelu()` |GELU 
activation +|`.softmax(dim)` |Softmax along a dimension (default: -1) +|`.flatten(start, end)` |Flatten dimensions +|=== Weights are initialized using Xavier initialization. The data type defaults to FP32; pass a `DType` to the constructor to change it: -```java +[source,java] +---- Module model = new SequentialModelBuilder(ctx, DType.fp16()) .input(784) .dense(256) .gelu() .dense(10) .build(); -``` +---- ---- +''''' -## Losses +=== Losses The `Losses` factory (in `sk.ainet.java`) creates loss function instances: -```java +[source,java] +---- import sk.ainet.java.Losses; import sk.ainet.lang.nn.loss.Loss; @@ -110,15 +116,16 @@ Loss hub = Losses.huber(1.0f); // Huber / Smooth L1 Loss hin = Losses.hinge(1.0f); // hinge loss Loss shin = Losses.squaredHinge(1.0f); // squared hinge Loss poi = Losses.poisson(); // Poisson NLL -``` +---- ---- +''''' -## Optimizers +=== Optimizers The `Optimizers` factory (in `sk.ainet.java`) creates optimizer instances: -```java +[source,java] +---- import sk.ainet.java.Optimizers; import sk.ainet.lang.nn.optim.Optimizer; @@ -136,15 +143,16 @@ Optimizer sgd = Optimizers.sgd(0.01, 0.9); // SGD with momentum and weight decay Optimizer sgdWd = Optimizers.sgd(0.01, 0.9, 0.0001); -``` +---- ---- +''''' -## TrainingLoop +=== TrainingLoop `TrainingLoop` ties together a model, loss function, optimizer, and execution context. Build it with the static builder: -```java +[source,java] +---- import sk.ainet.java.TrainingLoop; TrainingLoop loop = TrainingLoop.builder() @@ -153,22 +161,25 @@ TrainingLoop loop = TrainingLoop.builder() .optimizer(Optimizers.adam(0.001)) .context(ctx) .build(); -``` +---- -### Single Training Step +==== Single Training Step `step(x, y)` performs one forward pass, computes the loss, backpropagates, and updates weights. 
It returns the loss as a `float`: -```java +[source,java] +---- float loss = loop.step(inputBatch, targetBatch); System.out.printf("Step loss: %.4f%n", loss); -``` +---- -### Full Training with `.train()` +[[full-training-with-train]] +==== Full Training with `.train()` `train()` accepts a `Supplier` that produces an `Iterator` of `(input, target)` pairs for each epoch: -```java +[source,java] +---- import sk.ainet.java.TrainingResult; import kotlin.Pair; @@ -179,15 +190,17 @@ TrainingResult result = loop.train( System.out.printf("Trained %d epochs, final loss: %.4f%n", result.getEpochs(), result.getFinalLoss()); -``` +---- Each call to the supplier should return a fresh iterator over the training batches for that epoch. This allows reshuffling between epochs. -### Async Training with `.trainAsync()` +[[async-training-with-trainasync]] +==== Async Training with `.trainAsync()` -`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture`: +`trainAsync()` runs the training loop on a virtual thread and returns a `CompletableFuture++<++TrainingResult++>++`: -```java +[source,java] +---- import java.util.concurrent.CompletableFuture; CompletableFuture future = loop.trainAsync( @@ -199,23 +212,25 @@ CompletableFuture future = loop.trainAsync( TrainingResult result = future.join(); System.out.printf("Final loss: %.4f%n", result.getFinalLoss()); -``` +---- You can also compose the future: -```java +[source,java] +---- loop.trainAsync(() -> batches.iterator(), 10) .thenAccept(r -> System.out.println("Done! Loss: " + r.getFinalLoss())) .exceptionally(ex -> { ex.printStackTrace(); return null; }); -``` +---- ---- +''''' -## Loading MNIST Data +=== Loading MNIST Data The MNIST dataset loader lives in `sk.ainet.data.mnist`. 
The `MNISTBlocking` class provides blocking (non-suspend) methods for Java: -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTBlocking; import sk.ainet.data.mnist.MNISTDataset; @@ -225,34 +240,37 @@ MNISTDataset test = MNISTBlocking.loadTest(); System.out.println("Training samples: " + train.getImages().size()); // 60000 System.out.println("Test samples: " + test.getImages().size()); // 10000 -``` +---- The first call downloads the dataset from the internet and caches it. Subsequent calls load from disk. -### Custom Cache Directory +==== Custom Cache Directory -```java +[source,java] +---- import sk.ainet.data.mnist.MNISTLoaderConfig; MNISTLoaderConfig config = new MNISTLoaderConfig("/tmp/my-mnist-cache", true); MNISTDataset train = MNISTBlocking.loadTrain(config); -``` +---- -### Working with MNIST Data +==== Working with MNIST Data -Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte[]` of 784 pixels (28x28) and a `byte` label (0-9): +Each `MNISTDataset` contains a list of `MNISTImage` objects. Each image has a `byte++[]++` of 784 pixels (28x28) and a `byte` label (0-9): -```java +[source,java] +---- var firstImage = train.getImages().get(0); byte label = firstImage.getLabel(); // e.g. 
5 byte[] pixels = firstImage.getImage(); // 784 bytes, 0-255 -``` +---- -### Creating Tensor Batches +==== Creating Tensor Batches To feed MNIST data into the training loop, convert images to tensors: -```java +[source,java] +---- import sk.ainet.java.SKaiNET; import sk.ainet.lang.types.DType; import kotlin.Pair; @@ -287,15 +305,16 @@ for (int i = 0; i < images.size(); i += batchSize) { var y = SKaiNET.tensor(ctx, new int[]{actual}, DType.fp32(), yData); batches.add(new Pair<>(x, y)); } -``` +---- ---- +''''' -## Complete MNIST Training Example +=== Complete MNIST Training Example Putting it all together: -```java +[source,java] +---- package com.example; import sk.ainet.java.*; @@ -382,23 +401,25 @@ public class MnistTraining { return batches; } } -``` +---- Run with: -```bash +[source,bash] +---- java --enable-preview --add-modules jdk.incubator.vector \ -cp target/classes:target/dependency/* \ com.example.MnistTraining -``` +---- ---- +''''' -## Async Training Example +=== Async Training Example For non-blocking training, use `trainAsync()` and handle the result with `CompletableFuture`: -```java +[source,java] +---- var future = loop.trainAsync(() -> (Iterator) batches.iterator(), 10); // Monitor progress or do other work @@ -408,23 +429,25 @@ future.thenAccept(result -> { System.out.printf("Finished: %d epochs, loss %.4f%n", result.getEpochs(), result.getFinalLoss()); }).join(); -``` +---- ---- +''''' -## Package Reference +=== Package Reference -| Package | Key Classes | -|-----------------------|------------------------------------------------------| -| `sk.ainet.java` | `SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` | -| `sk.ainet.data.mnist` | `MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` | -| `sk.ainet.lang.types` | `DType` | -| `sk.ainet.lang.nn.loss` | `Loss` (interface returned by `Losses` factory) | -| `sk.ainet.lang.nn.optim` | `Optimizer` (interface returned by 
`Optimizers` factory) | +[cols=",",options="header",] +|=== +|Package |Key Classes +|`sk.ainet.java` |`SKaiNET`, `SequentialModelBuilder`, `TrainingLoop`, `TrainingResult`, `Losses`, `Optimizers`, `TensorJavaOps` +|`sk.ainet.data.mnist` |`MNISTBlocking`, `MNISTDataset`, `MNISTImage`, `MNISTLoaderConfig` +|`sk.ainet.lang.types` |`DType` +|`sk.ainet.lang.nn.loss` |`Loss` (interface returned by `Losses` factory) +|`sk.ainet.lang.nn.optim` |`Optimizer` (interface returned by `Optimizers` factory) +|=== ---- +''''' -## Next Steps +=== Next Steps -- [Java Getting Started](java-getting-started.md) -- tensor operations, project setup, and dependency management. -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, and build agents. +* link:java-getting-started.md[Java Getting Started] -- tensor operations, project setup, and dependency management. +* link:java-llm-inference.md[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, and build agents. diff --git a/docs/modules/ROOT/pages/index.adoc b/docs/modules/ROOT/pages/index.adoc new file mode 100644 index 00000000..fd1fda7e --- /dev/null +++ b/docs/modules/ROOT/pages/index.adoc @@ -0,0 +1,34 @@ += SKaiNET +:description: Kotlin Multiplatform tensor engine with a graph IR, pluggable backends, and StableHLO export. + +SKaiNET is a Kotlin Multiplatform tensor / compile / graph engine. +It provides a tensor DSL, execution contexts, a graph IR, model +loaders (GGUF, SafeTensors, ONNX), quantization primitives +(Q4_K, Q8_0, ternary, TurboQuant), a StableHLO emitter for cross- +platform compile targets, and a pluggable backend API that CPU, +GPU, and NPU backends can implement independently. + +This documentation site is organized following the +https://diataxis.fr/[DiΓ‘taxis / Divio framework]: + +Tutorials:: Learning-oriented. Start here if you are new to SKaiNET. +How-to guides:: Task-oriented. Recipes for solving specific problems. +Reference:: Information-oriented. 
Looking up APIs and op coverage. +Explanation:: Understanding-oriented. Background on design decisions. + +[NOTE] +==== +LLM-specific runtimes (Llama, Gemma, Qwen, BERT) live in the +sibling https://github.com/SKaiNET-developers/SKaiNET-transformers[SKaiNET-transformers] +repository and its own documentation site. This site covers the +engine layer only. +==== + +== Quick links + +* link:../api/index.html[API reference (Dokka)] (bundled at publish time) + +// The Tutorials / How-to / Reference / Explanation pages plus the +// operator coverage xref land in follow-up commits (#2 and #3 of +// the Antora migration). This page ships the landing copy first so +// the scaffold build succeeds with a real start_page. diff --git a/docs/modules/ROOT/pages/reference/api.adoc b/docs/modules/ROOT/pages/reference/api.adoc new file mode 100644 index 00000000..ba400d1f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/api.adoc @@ -0,0 +1,19 @@ += API Reference +:description: Kotlin API reference generated by Dokka. + +The full Kotlin API reference for every SKaiNET module is +generated by https://kotlinlang.org/docs/dokka-introduction.html[Dokka] +and published as a sibling path of this documentation site. + +link:../api/index.html[Open the Dokka API reference, window=_blank] + +[NOTE] +==== +The Dokka output is bundled into the published site by a +`bundleDokkaIntoSite` Gradle task that runs **after** Antora +writes the site. When you preview the site locally via +`docker run ... antora ... docs/antora-playbook.yml`, the +`/api/` path does not yet exist β€” run +`./gradlew bundleDokkaIntoSite` to populate it before clicking +through. 
+==== diff --git a/docs/modules/ROOT/pages/reference/architecture.adoc b/docs/modules/ROOT/pages/reference/architecture.adoc new file mode 100644 index 00000000..d350b26f --- /dev/null +++ b/docs/modules/ROOT/pages/reference/architecture.adoc @@ -0,0 +1,11 @@ += Architecture +:description: How SKaiNET's compile and execution layers are organized. + +SKaiNET uses a hybrid backend strategy that separates development +iteration from production deployment. + +image::SKaiNET-compiler.svg[Architecture diagram of the SKaiNET compiler pipeline] + +// The original ARCHITECTURE.md at the repo root was a 4-line stub +// pointing at the compiler diagram. If you are looking for a +// deeper architecture write-up, contribute it as a PR to this page. diff --git a/docs/modules/ROOT/pages/reference/operators/generated/index.adoc b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc new file mode 100644 index 00000000..e64fe818 --- /dev/null +++ b/docs/modules/ROOT/pages/reference/operators/generated/index.adoc @@ -0,0 +1,14 @@ += AI-NET Operators Reference + +Generated from version `1.0.0` on 2026-04-13 + +== Operators by Modality + +=== Core + +* xref:reference/operators/generated/voidtensorops.adoc[VoidTensorOps] + +=== Composite + +* xref:reference/operators/generated/similarity.adoc[Similarity] + diff --git a/docs/modules/operators/_generated_/similarity.adoc b/docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc similarity index 100% rename from docs/modules/operators/_generated_/similarity.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/similarity.adoc diff --git a/docs/modules/operators/_generated_/voidtensorops.adoc b/docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc similarity index 100% rename from docs/modules/operators/_generated_/voidtensorops.adoc rename to docs/modules/ROOT/pages/reference/operators/generated/voidtensorops.adoc diff --git 
a/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc new file mode 100644 index 00000000..6ee3957d --- /dev/null +++ b/docs/modules/ROOT/pages/reference/ops-status-matrix.adoc @@ -0,0 +1,19 @@ += Operator Coverage Matrix +:description: Cross-backend status for every operator function in SKaiNET. + +Generated from `operators.json` version `1.0.0` on 2026-04-13. + +Rows are `Operator.function` pairs; columns are backends that appear in any function's `statusByBackend` map. A missing entry means the backend makes no claim about the function — treat it as "unknown", not "not supported". + +[cols="2,1,1,1,1", options="header"] +|=== +| Operator.function | Metal | apple | cpu | wasm + +| `VoidTensorOps.matmul` | 🚧 | — | — | — +| `VoidTensorOps.transpose` | 🚧 | — | — | — +| `Similarity.cosineDistance` | — | ✅ | ✅ | ✅ + +| *Done* | *0 / 3* | *1 / 3* | *1 / 3* | *1 / 3* +|=== + +Per-function detail including notes lives in xref:reference/operators/generated/index.adoc[Operator reference]. diff --git a/docs/graph-dsl.md b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc similarity index 77% rename from docs/graph-dsl.md rename to docs/modules/ROOT/pages/tutorials/graph-dsl.adoc index 8c65ac17..3112dfc5 100644 --- a/docs/graph-dsl.md +++ b/docs/modules/ROOT/pages/tutorials/graph-dsl.adoc @@ -1,12 +1,13 @@ -# SKaiNET Graph DSL +== SKaiNET Graph DSL The SKaiNET Graph DSL provides a way to define complex directed acyclic graphs (DAGs) for machine learning models. Unlike the sequential `nn` DSL, the `dag` DSL allows for arbitrary wiring of nodes, multi-output graphs, and reusable modules.
-## Basic Usage +=== Basic Usage To define a graph, use the `dag` block: -```kotlin +[source,kotlin] +---- val program = dag { val x = input("input", TensorSpec("input", listOf(1, 3, 224, 224), "FP32")) @@ -18,33 +19,35 @@ val program = dag { output(activated) } -``` +---- -## Key Concepts +=== Key Concepts -### Inputs, Parameters, and Constants +==== Inputs, Parameters, and Constants -- `input(name, spec)`: Defines an input node for the graph. -- `parameter(name) { ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. -- `constant(name) { ... }`: Defines a constant node (e.g., fixed biases or weights). +* `input++<++T++>++(name, spec)`: Defines an input node for the graph. +* `parameter++<++T, V++>++(name) ++{++ ... }`: Defines a learnable parameter node. You can use a builder to specify shape and initialization. +* `constant++<++T, V++>++(name) ++{++ ... }`: Defines a constant node (e.g., fixed biases or weights). -### Operations +==== Operations Standard operations like `conv2d`, `relu`, `matmul`, `add`, etc., are available as extension functions within the `DagBuilder` (operations are in sync with TensorOps and implemented extention method via KSP). -### Outputs +==== Outputs A graph can have one or more outputs, defined using the `output()` function. 
-```kotlin +[source,kotlin] +---- output(branch1, branch2) -``` +---- -## Reusable Modules +=== Reusable Modules You can define reusable graph components using `dagModule`: -```kotlin +[source,kotlin] +---- val residualBlock = dagModule { inputs -> val x = inputs[0] val conv1 = conv2d(x, w1, b1, padding = 1 to 1) @@ -59,25 +62,27 @@ val program = dag { val out = module(residualBlock, listOf(x)) output(out[0]) } -``` +---- -## Compiling and Validating +=== Compiling and Validating Once a `GraphProgram` is built, it can be converted to a `ComputeGraph` for execution or compilation: -```kotlin +[source,kotlin] +---- val graph = program.toComputeGraph() val validation = graph.validate() if (validation is ValidationResult.Valid) { // proceed to execution or compilation } -``` +---- -## YOLO-style Example +=== YOLO-style Example The Graph DSL is particularly useful for complex architectures like YOLO heads: -```kotlin +[source,kotlin] +---- val program = dag { val input = input("input", TensorSpec("input", listOf(1, 3, 640, 640), "FP32")) @@ -90,4 +95,4 @@ val program = dag { output(c2, head) // Multi-scale outputs } -``` +---- diff --git a/docs/hlo-getting-started.md b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc similarity index 59% rename from docs/hlo-getting-started.md rename to docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc index c0a116a4..d7d47a92 100644 --- a/docs/hlo-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/hlo-getting-started.adoc @@ -1,34 +1,35 @@ -# Getting Started with HLO in SKaiNET +== Getting Started with HLO in SKaiNET -## What is HLO? +=== What is HLO? -HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on [StableHLO](https://github.com/openxla/stablehlo) - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. 
+HLO (High-Level Operations) is SKaiNET's intermediate representation for neural network computations, based on https://github.com/openxla/stablehlo[StableHLO] - the portable high-level operation set for machine learning. HLO serves as a bridge between SKaiNET's Kotlin DSL and various execution backends, enabling optimizations and cross-platform deployment. -## Why MLIR/XLA Instead of Direct Backends? +=== Why MLIR/XLA Instead of Direct Backends? SKaiNET uses the MLIR/XLA compilation approach rather than implementing separate backends for each hardware target. This design choice provides several key advantages: -**Single Implementation Path**: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. +*Single Implementation Path*: Write operations once in Kotlin, compile to StableHLO MLIR, then let XLA handle hardware-specific optimizations. No need to maintain separate CUDA, Metal, or ROCm implementations. -**Automatic Optimization**: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. +*Automatic Optimization*: XLA provides sophisticated optimizations like operator fusion, memory layout optimization, and hardware-specific kernel selection without manual tuning. -**Future-Proof**: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. +*Future-Proof*: New hardware targets (like future GPU architectures) are automatically supported when XLA adds support, without requiring SKaiNET updates. -**Ecosystem Integration**: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. 
+*Ecosystem Integration*: Full compatibility with JAX, TensorFlow, and other MLIR-based frameworks enables model sharing and toolchain reuse. -### Key Benefits +==== Key Benefits -- **Portability**: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) -- **Optimization**: Leverage XLA's advanced compiler optimizations and operator fusion -- **Interoperability**: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems -- **Performance**: Hardware-specific optimizations without manual kernel development -- **No Backend Lock-in**: Single compilation target supports all hardware through XLA +* *Portability*: Write once, compile to any XLA-supported hardware (CPU, GPU, TPU) +* *Optimization*: Leverage XLA's advanced compiler optimizations and operator fusion +* *Interoperability*: Full compatibility with XLA, JAX, TensorFlow, and MLIR ecosystems +* *Performance*: Hardware-specific optimizations without manual kernel development +* *No Backend Lock-in*: Single compilation target supports all hardware through XLA -## Architecture Overview +=== Architecture Overview SKaiNET's HLO compilation pipeline transforms high-level Kotlin DSL operations into hardware-optimized executable code through the MLIR/XLA ecosystem: -```mermaid +[mermaid] +---- graph TD A[Kotlin DSL] --> B[Compute Graph] B --> C[HLO Converter] @@ -58,11 +59,12 @@ graph TD style A fill:#e1f5fe style D fill:#f3e5f5 style F fill:#e8f5e8 -``` +---- -### Data Flow Architecture +==== Data Flow Architecture -```mermaid +[mermaid] +---- flowchart LR subgraph "Input Layer" DSL[Kotlin DSL Code] @@ -92,48 +94,53 @@ flowchart LR style DSL fill:#bbdefb style Conv fill:#c8e6c9 style MLIR fill:#ffcdd2 -``` +---- -## Building Blocks +=== Building Blocks -### 1. HLO Converters +[[1-hlo-converters]] +==== 1. 
HLO Converters Converters transform SKaiNET operations into StableHLO operations: -- **MathOperationsConverter**: Basic arithmetic operations -- **LinalgOperationsConverter**: Linear algebra operations -- **ActivationOperationsConverter**: Neural network activations -- **NeuralNetOperationsConverter**: High-level NN operations -- **ConstantOperationsConverter**: Constant value operations +* *MathOperationsConverter*: Basic arithmetic operations +* *LinalgOperationsConverter*: Linear algebra operations +* *ActivationOperationsConverter*: Neural network activations +* *NeuralNetOperationsConverter*: High-level NN operations +* *ConstantOperationsConverter*: Constant value operations -### 2. Type System +[[2-type-system]] +==== 2. Type System HLO uses a strict type system for tensors: -```kotlin +[source,kotlin] +---- // SKaiNET tensor type Tensor // Batch, Channel, Height, Width // Converts to HLO type tensor<1x3x224x224xf32> // StableHLO representation -``` +---- -### 3. Optimization Framework +[[3-optimization-framework]] +==== 3. Optimization Framework The optimization pipeline includes: -- **Shape inference and propagation** -- **Constant folding and dead code elimination** -- **Operation fusion for performance** -- **Memory layout optimization** +* *Shape inference and propagation* +* *Constant folding and dead code elimination* +* *Operation fusion for performance* +* *Memory layout optimization* -## Practical Example: RGB to Grayscale Conversion +=== Practical Example: RGB to Grayscale Conversion -Let's walk through converting a color image tensor `Tensor` to grayscale using matrix multiplication. +Let's walk through converting a color image tensor `Tensor++<++B,C,H,W++>++` to grayscale using matrix multiplication. 
-### Step 1: Define the Operation in Kotlin DSL +==== Step 1: Define the Operation in Kotlin DSL -```kotlin +[source,kotlin] +---- // From: skainet-lang/skainet-lang-models/src/commonMain/kotlin/sk/ainet/lang/model/compute/Rgb2GrayScaleMultiply.kt fun Tensor.rgb2GrayScaleMatMul(): Tensor { // RGB to grayscale weights: [0.299, 0.587, 0.114] @@ -151,13 +158,14 @@ fun Tensor.rgb2GrayScaleMatMul(): Tensor { // Reshape back to [B,1,H,W] return gray.transpose(intArrayOf(0, 3, 1, 2)) } -``` +---- -### Step 2: HLO Conversion Process +==== Step 2: HLO Conversion Process The conversion pipeline transforms this operation: -```mermaid +[mermaid] +---- sequenceDiagram participant DSL as Kotlin DSL participant DAG as Compute Graph @@ -173,13 +181,14 @@ sequenceDiagram Opt->>HLO: Optimized IR Note over Conv,HLO: Type inference:
tensor β†’ tensor -``` +---- -### Step 3: Generated StableHLO IR +==== Step 3: Generated StableHLO IR The converter produces MLIR code like this: -```mlir +[source,mlir] +---- func.func @rgb2grayscale(%input: tensor) -> tensor { // Define grayscale conversion weights %weights = stablehlo.constant dense<[[0.299], [0.587], [0.114]]> : tensor<3x1xf32> @@ -199,28 +208,30 @@ func.func @rgb2grayscale(%input: tensor) -> tensor { return %result : tensor } -``` +---- -## Hardware Target Compilation via XLA +=== Hardware Target Compilation via XLA SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware platforms without requiring separate backend implementations. The StableHLO IR serves as a portable intermediate representation that XLA can compile to optimized code for various targets. -### Supported Hardware Targets +==== Supported Hardware Targets -- **CPU**: x86_64, ARM64 (via XLA CPU backend) -- **GPU**: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) -- **TPU**: Google TPUs (via XLA TPU backend) -- **Mobile**: iOS Metal, Android GPU (via XLA mobile backends) +* *CPU*: x86++_++64, ARM64 (via XLA CPU backend) +* *GPU*: NVIDIA CUDA, AMD ROCm (via XLA GPU backend) +* *TPU*: Google TPUs (via XLA TPU backend) +* *Mobile*: iOS Metal, Android GPU (via XLA mobile backends) -### Prerequisites for GPU Compilation +==== Prerequisites for GPU Compilation -1. **XLA with GPU support**: [Installation guide](https://www.tensorflow.org/xla/tutorials/compile) -2. **NVIDIA CUDA Toolkit** (for NVIDIA GPUs): [Download here](https://developer.nvidia.com/cuda-downloads) -3. **ROCm** (for AMD GPUs): [Installation guide](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) +[arabic] +. *XLA with GPU support*: https://www.tensorflow.org/xla/tutorials/compile[Installation guide] +. *NVIDIA CUDA Toolkit* (for NVIDIA GPUs): https://developer.nvidia.com/cuda-downloads[Download here] +. 
*ROCm* (for AMD GPUs): https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html[Installation guide] -### Step 1: Generate StableHLO IR +==== Step 1: Generate StableHLO IR -```bash +[source,bash] +---- # Build SKaiNET HLO compiler ./gradlew :skainet-compile:skainet-compile-hlo:build @@ -228,11 +239,12 @@ SKaiNET uses the MLIR/XLA compilation pipeline to target different hardware plat ./gradlew :skainet-compile:skainet-compile-hlo:generateHlo \ -Pmodel=rgb2grayscale \ -Poutput=rgb2grayscale.mlir -``` +---- -### Step 2: Compile with XLA for Target Hardware +==== Step 2: Compile with XLA for Target Hardware -```bash +[source,bash] +---- # Compile to GPU executable (NVIDIA CUDA) xla_compile \ --input_format=mlir \ @@ -257,24 +269,26 @@ xla_compile \ --platform=tpu \ --input_file=rgb2grayscale.mlir \ --output_file=rgb2grayscale_tpu.so -``` +---- -### Step 3: Runtime Execution +==== Step 3: Runtime Execution -```bash +[source,bash] +---- # Execute on target hardware using XLA runtime xla_run \ --executable=rgb2grayscale_cuda.so \ --input=image.jpg \ --output=gray.jpg \ --device=gpu:0 -``` +---- -### Jetson and Edge Device Deployment +==== Jetson and Edge Device Deployment For NVIDIA Jetson and other edge devices, the same MLIR β†’ XLA compilation approach applies: -```bash +[source,bash] +---- # Cross-compile for ARM64 with CUDA support xla_compile \ --input_format=mlir \ @@ -292,15 +306,16 @@ scp rgb2grayscale_jetson.so jetson@192.168.1.100:~/models/ ssh jetson@192.168.1.100 cd ~/models xla_run --executable=rgb2grayscale_jetson.so --device=gpu:0 -``` +---- -## Advanced Topics +=== Advanced Topics -### Custom HLO Operations +==== Custom HLO Operations Extend SKaiNET with custom operations: -```kotlin +[source,kotlin] +---- // Define custom operation @HloOperation("custom.rgb_enhance") class RgbEnhanceOp : HloConverter { @@ -311,34 +326,36 @@ class RgbEnhanceOp : HloConverter { """ } } -``` +---- -### Debugging HLO +==== Debugging HLO Use SKaiNET's 
built-in debugging tools: -```kotlin +[source,kotlin] +---- // Enable HLO debugging val optimizer = StableHloOptimizer(debugMode = true) val optimizedHlo = optimizer.optimize(hloModule) // Visualize computation graph optimizer.dumpGraphviz("rgb2gray.dot") -``` +---- -## Resources and References +=== Resources and References -- [StableHLO Specification](https://github.com/openxla/stablehlo/blob/main/docs/spec.md) -- [MLIR Documentation](https://mlir.llvm.org/docs/) -- [XLA Compilation Guide](https://www.tensorflow.org/xla) -- [NVIDIA Jetson Documentation](https://docs.nvidia.com/jetson/) -- [SKaiNET HLO Examples](./examples/hlo/) +* https://github.com/openxla/stablehlo/blob/main/docs/spec.md[StableHLO Specification] +* https://mlir.llvm.org/docs/[MLIR Documentation] +* https://www.tensorflow.org/xla[XLA Compilation Guide] +* https://docs.nvidia.com/jetson/[NVIDIA Jetson Documentation] +* link:./examples/hlo/[SKaiNET HLO Examples] -## Next Steps +=== Next Steps -1. **Explore Examples**: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` -2. **Run Tests**: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` -3. **Contribute**: Add new HLO converters for missing operations -4. **Optimize**: Profile and optimize your models using HLO tools +[arabic] +. *Explore Examples*: Check `skainet-compile/skainet-compile-hlo/src/commonMain/kotlin/sk/ainet/compile/hlo/examples/` +. *Run Tests*: Execute `./gradlew :skainet-compile:skainet-compile-hlo:test` +. *Contribute*: Add new HLO converters for missing operations +. *Optimize*: Profile and optimize your models using HLO tools -For more detailed information, see the [HLO Optimization Guide](./OPTIMIZATION.md) and [API Documentation](https://docs.skainet.sk/hlo/). \ No newline at end of file +For more detailed information, see the link:./OPTIMIZATION.md[HLO Optimization Guide] and https://docs.skainet.sk/hlo/[API Documentation]. 
diff --git a/docs/java-getting-started.md b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc similarity index 78% rename from docs/java-getting-started.md rename to docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index e64be280..003a6d46 100644 --- a/docs/java-getting-started.md +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -1,31 +1,33 @@ -# Java Getting Started Guide +== Java Getting Started Guide This guide gets you from zero to running tensor operations with SKaiNET in under 5 minutes. SKaiNET is a Kotlin Multiplatform AI framework, but every JVM-facing API is designed for idiomatic Java usage -- no Kotlin knowledge required. -## Prerequisites +=== Prerequisites -- **JDK 21 or later** (required for Vector API and virtual threads) -- **Maven 3.8+** or **Gradle 8.4+** +* *JDK 21 or later* (required for Vector API and virtual threads) +* *Maven 3.8{plus}* or *Gradle 8.4{plus}* -## JVM Flags +=== JVM Flags SKaiNET uses the Java Vector API for SIMD-accelerated tensor operations. You must pass two flags every time you run your application: -``` +.... --enable-preview --add-modules jdk.incubator.vector -``` +.... -For Maven Surefire / exec-maven-plugin, add them to ``. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. +For Maven Surefire / exec-maven-plugin, add them to `++<++jvmArgs++>++`. For Gradle, add them to `jvmArgs` in your run task. Examples are shown below. ---- +''''' -## Maven Setup +=== Maven Setup -### 1. Import the BOM +[[1-import-the-bom]] +==== 1. Import the BOM -The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. Add it to your `` section: +The `skainet-bom` manages all SKaiNET module versions so you never have to keep them in sync manually. 
Add it to your `++<++dependencyManagement++>++` section: -```xml +[source,xml] +---- 0.13.0 @@ -76,13 +78,15 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep -``` +---- -### 2. Add More Modules as Needed +[[2-add-more-modules-as-needed]] +==== 2. Add More Modules as Needed Because the BOM is imported, you can add any module without repeating the version: -```xml +[source,xml] +---- sk.ainet @@ -106,13 +110,14 @@ Because the BOM is imported, you can add any module without repeating the versio sk.ainet skainet-kllama-agent-jvm -``` +---- ---- +''''' -## Gradle Kotlin DSL Setup +=== Gradle Kotlin DSL Setup -```kotlin +[source,kotlin] +---- plugins { java application @@ -144,15 +149,16 @@ application { tasks.withType { options.compilerArgs.addAll(listOf("--enable-preview")) } -``` +---- ---- +''''' -## Hello Tensor +=== Hello Tensor Create `src/main/java/com/example/HelloTensor.java`: -```java +[source,java] +---- package com.example; import sk.ainet.java.SKaiNET; @@ -186,39 +192,43 @@ public class HelloTensor { System.out.println("after relu: " + d); } } -``` +---- Run it: -```bash +[source,bash] +---- # Maven mvn compile exec:java # Gradle ./gradlew run -``` +---- ---- +''''' -## Key Entry Points +=== Key Entry Points All Java-facing classes live in the `sk.ainet.java` package: -| Class | Purpose | -|-------------------|--------------------------------------------------------| -| `SKaiNET` | Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` | -| `TensorJavaOps` | Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... | -| `Losses` | Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... | -| `Optimizers` | Optimizer factory -- `adam()`, `adamw()`, `sgd()` | -| `DType` | Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... 
| +[cols=",",options="header",] +|=== +|Class |Purpose +|`SKaiNET` |Static factory -- `context()`, `tensor()`, `zeros()`, `ones()`, `randn()`, `full()` +|`TensorJavaOps` |Static tensor ops -- `matmul()`, `relu()`, `softmax()`, `add()`, `reshape()`, ... +|`Losses` |Loss function factory -- `crossEntropy()`, `mse()`, `binaryCrossEntropy()`, ... +|`Optimizers` |Optimizer factory -- `adam()`, `adamw()`, `sgd()` +|`DType` |Data type selectors -- `DType.fp32()`, `DType.fp16()`, `DType.bf16()`, `DType.int32()`, ... +|=== ---- +''''' -## Data Types +=== Data Types Access data types through static methods on `DType` (from `sk.ainet.lang.types`): -```java +[source,java] +---- import sk.ainet.lang.types.DType; DType f32 = DType.fp32(); // 32-bit float (default) @@ -229,15 +239,16 @@ DType i8 = DType.int8(); // 8-bit integer DType i32 = DType.int32(); // 32-bit integer DType i64 = DType.int64(); // 64-bit integer DType u8 = DType.uint8(); // unsigned 8-bit -``` +---- -You can also use the constant fields if you prefer: `DType.FP32_TYPE`, `DType.INT32_TYPE`, etc. +You can also use the constant fields if you prefer: `DType.FP32++_++TYPE`, `DType.INT32++_++TYPE`, etc. ---- +''''' -## Common Tensor Operations +=== Common Tensor Operations -```java +[source,java] +---- var ctx = SKaiNET.context(); // Creation @@ -277,11 +288,11 @@ var flat = TensorJavaOps.flatten(a); var resh = TensorJavaOps.reshape(a, new int[]{1, -1}); var sq = TensorJavaOps.squeeze(a, 0); var usq = TensorJavaOps.unsqueeze(a, 0); -``` +---- ---- +''''' -## Next Steps +=== Next Steps -- [LLM Inference Guide](java-llm-inference.md) -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. -- [Model Training Guide](java-model-training.md) -- build sequential models, train on MNIST, and run async training loops. +* link:java-llm-inference.md[LLM Inference Guide] -- load GGUF/SafeTensors models, generate text, run BERT embeddings, and build tool-calling agents. 
+* link:java-model-training.md[Model Training Guide] -- build sequential models, train on MNIST, and run async training loops. diff --git a/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc new file mode 100644 index 00000000..153d32ef --- /dev/null +++ b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc @@ -0,0 +1,26 @@ += KLlama Getting Started + +KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. + +____ +*Early Stage Development*: The project is in active development. We appreciate your feedback and bug reports! +____ + +== Choose Your Path + +[cols=",",options="header",] +|=== +|Goal |Guide +|*Run models from the command line* |link:../skainet-apps/skainet-kllama-cli/README.md[KLlama CLI] +|*Embed in a Kotlin application* |link:../skainet-apps/skainet-kllama/README.md[KLlama Library] +|*Embed in a Java application* |link:java-llm-inference.md[Java LLM Inference Guide] +|*Build a standalone Java CLI app* |link:java-cli-app.md[Java CLI App Guide] +|*Java project setup (Maven / Gradle)* |xref:java-getting-started.adoc[Java Getting Started] +|=== + +== Quick Links + +* link:++../skainet-apps/skainet-kllama/README.md#supported-formats--quantization++[Supported formats & quantization] +* link:../skainet-apps/skainet-kllama/README.md#custom-backend-integration[Custom backend integration] +* link:java-llm-inference.md#agent-loop-and-tool-calling[Agent & tool calling] +* link:java-llm-inference.md#bert-encoding-and-similarity[BERT embeddings & similarity] diff --git a/docs/modules/operators/_generated_/index.adoc b/docs/modules/operators/_generated_/index.adoc deleted file mode 100644 index e172df10..00000000 --- a/docs/modules/operators/_generated_/index.adoc +++ /dev/null @@ -1,14 +0,0 @@ -= AI-NET Operators Reference - -Generated from
version `1.0.0` on 2026-03-03 - -== Operators by Modality - -=== Core - -* xref:voidtensorops.adoc[VoidTensorOps] - -=== Composite - -* xref:similarity.adoc[Similarity] - diff --git a/docs/nav.adoc b/docs/nav.adoc deleted file mode 100644 index f23df7ae..00000000 --- a/docs/nav.adoc +++ /dev/null @@ -1,50 +0,0 @@ -= SKaiNET Documentation Navigation - -[#main-nav] -== Main Navigation - -* xref:theory/index.adoc[Mathematical Theory] -** xref:theory/matmul.adoc[Matrix Multiplication] -* xref:examples/index.adoc[Usage Examples] -** xref:examples/matmul-examples.adoc[Matrix Multiplication Examples] -* xref:modules/operators/_generated_/index.adoc[Generated API Reference] - -[#quick-reference] -== Quick Reference - -=== Core Operations -* xref:theory/matmul.adoc#matmul-definition[Matrix Multiplication Theory] -* xref:examples/matmul-examples.adoc#basic-usage[Basic Matrix Multiplication] -* xref:examples/matmul-examples.adoc#neural-network[Neural Network Applications] - -=== Documentation Structure -* `docs/theory/` - Mathematical definitions and theoretical foundations -* `docs/examples/` - Practical usage examples and code samples -* `docs/modules/operators/_generated_/` - Auto-generated API reference - -[#toc-template] -== Table of Contents Template - -The following template can be used for generating table of contents in documentation pages: - ----- -[discrete] -== Table of Contents - -* <> -** <> -* <> ----- - -[#cross-reference-patterns] -== Cross-Reference Patterns - -=== Internal Links -* Theory to Examples: `xref:../examples/matmul-examples.adoc#basic-usage[Matrix Multiplication Examples]` -* Examples to Theory: `xref:../theory/matmul.adoc#matmul-definition[Mathematical Definition]` -* Generated to Human: `xref:../../theory/index.adoc[Theory Reference]` - -=== Anchor Naming Conventions -* Theory anchors: `#operation-definition`, `#operation-properties`, `#operation-complexity` -* Example anchors: `#basic-usage`, `#advanced-usage`, `#performance-tips` -* Generated 
anchors: `#operator-{name}`, `#function-{operator}-{function}` \ No newline at end of file diff --git a/docs/perf/java-25-cpu-backend.md b/docs/perf/java-25-cpu-backend.md deleted file mode 100644 index e66f588d..00000000 --- a/docs/perf/java-25-cpu-backend.md +++ /dev/null @@ -1,99 +0,0 @@ -### Java 25 Advantages for the JVM CPU Backend - -Java 25 (GA September 2025) delivers significant free performance improvements to the -SKaiNET JVM CPU backend through JIT/C2 optimizations, faster Panama FFI, and new GC/startup -features β€” all without requiring code changes. - -#### Compatibility - -The same code, same flags, and same runtime detection work across JDK 21–25: - -- Vector API remains incubator on JDK 25 (JEP 508) β€” identical `jdk.incubator.vector` package. -- Panama FFI finalized in JDK 22; `--enable-preview` is harmless on 22+. -- Runtime detection (`Class.forName`, `Runtime.version()`) works on all versions. -- Build config (`jvmTarget = JVM_21`, `options.release.set(21)`) produces compatible bytecode. - -**No special treatment is needed for JDK >= 21 but < 25.** - -Required flags remain: -``` ---enable-preview --add-modules jdk.incubator.vector -``` - -#### JIT / C2 improvements mapped to SKaiNET ops - -These are automatic β€” the JIT produces better native code for existing bytecode. 
- -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| VPointer refactoring for vector loads/stores | [JDK-8350748](https://bugs.openjdk.org/browse/JDK-8350748) | up to 14x | All `FloatVector.fromArray` / `fromMemorySegment` loops in `JvmVectorKernels.kt`, `JvmQuantizedVectorKernels.kt` | -| SuperWord SIMD enhancement | [JDK-8343685](https://bugs.openjdk.org/browse/JDK-8343685) | up to 33x | Same vectorized loops (elementwise, reductions, matmul inner loops) | -| `Math.max` / `Math.min` intrinsified for `long` | JDK-8350485 | 3–5x | Shape computation, tile clamping in blocked matmul | - -Source files: -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmVectorKernels.kt` -- `skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt` - -#### Panama FFI improvements - -| Improvement | JDK bug | Speedup | Affected SKaiNET code | -|---|---|---|---| -| Faster `MemorySegment` allocation | [JDK-8345687](https://bugs.openjdk.org/browse/JDK-8345687) | ~2x | `MemorySegmentTensorData.kt` (`MemorySegmentTensorDataFactory`), `PagedKvCache.kt` | -| `MemorySegment::fill` optimized on AArch64 | [JDK-8354674](https://bugs.openjdk.org/browse/JDK-8354674) | ~2.5x | Tensor zeroing, blocked matmul result initialization | - -Source files: -- `skainet-lang/skainet-lang-core/src/jvmMain/kotlin/sk/ainet/lang/tensor/data/MemorySegmentTensorData.kt` -- `skainet-apps/skainet-kllama/src/jvmMain/kotlin/sk/ainet/apps/kllama/PagedKvCache.kt` - -#### Object layout and GC - -- **Compact Object Headers** (JEP 519) β€” reduces object header from 12 to 8 bytes. - Meaningful for tensor metadata arrays with millions of small objects. - Opt-in: `-XX:+UseCompactObjectHeaders` - -- **Generational Shenandoah** (JEP 521) β€” lower GC pause times for allocation-heavy - workloads (tensor creation, KV cache churn). 
- Opt-in: `-XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational` - -#### Startup and warmup - -- **AOT profiling / caching** (JEP 515) β€” records JIT profile data from a training run - and replays it on subsequent launches. Reduces warmup by 15–25%. - Useful for CLI apps like kLLaMA where first-token latency matters. - -Usage: -``` -# Training run (records profile) -java -XX:AOTCacheOutput=app.aot -jar kllama.jar --prompt "warmup" - -# Production run (replays profile) -java -XX:AOTCache=app.aot -jar kllama.jar --prompt "Hello" -``` - -#### Recommended JVM flags for Java 25 - -Required (same as JDK 21–24): -``` ---enable-preview ---add-modules jdk.incubator.vector -``` - -Optional β€” enable for maximum benefit on JDK 25: -``` --XX:+UseCompactObjectHeaders --XX:+UseShenandoahGC -XX:ShenandoahGCMode=generational --XX:AOTCache=app.aot # after training run -``` - -#### Summary - -| Feature | Benefit | Component | -|---|---|---| -| VPointer refactoring (C2) | Up to 14x faster vector loads/stores | `JvmVectorKernels`, `JvmQuantizedVectorKernels` | -| SuperWord SIMD (C2) | Up to 33x faster auto-vectorized loops | Same vector kernel files | -| `Math.max/min` intrinsic | 3–5x faster long comparisons | Shape computation, tile clamping | -| Faster segment allocation | ~2x allocation throughput | `MemorySegmentTensorDataFactory`, `PagedKvCache` | -| `MemorySegment::fill` (AArch64) | ~2.5x faster bulk zeroing | Tensor init, matmul result buffers | -| Compact Object Headers | ~30% smaller object headers | All tensor metadata | -| Generational Shenandoah | Lower GC pauses | Allocation-heavy inference | -| AOT profiling | 15–25% faster warmup | CLI apps (kLLaMA) | diff --git a/docs/perf/jvm-cpu.md b/docs/perf/jvm-cpu.md deleted file mode 100644 index fc981566..00000000 --- a/docs/perf/jvm-cpu.md +++ /dev/null @@ -1,94 +0,0 @@ -### JVM CPU Backend Performance Benchmarks (JMH) - -This page explains how to run the JMH benchmarks for the JVM CPU backend and how to capture evidence 
for performance targets. - -#### What’s included -- Elementwise: FP32 `add` on 1,000,000 elements -- Reductions: FP32 `sum` and `mean` on 1,000,000 elements -- Matmul: FP32 square `matmul` with sizes 256, 512, and 1024 - -Benchmarks are implemented in module: -- `:skainet-backends:benchmarks:jvm-cpu-jmh` - -Source files: -- `src/jmh/kotlin/sk/ainet/bench/ElementwiseAdd1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/Reductions1MBench.kt` -- `src/jmh/kotlin/sk/ainet/bench/MatmulBench.kt` - -#### Prerequisites -- JDK 21+ (JDK 22 toolchain configured by Gradle) -- Gradle will pass required JVM flags: - - `--enable-preview` - - `--add-modules jdk.incubator.vector` - -For Java 25-specific performance advantages, see [Java 25 CPU Backend](java-25-cpu-backend.md). - -#### Feature flags -You can toggle acceleration paths at runtime using system properties or environment variables: -- Vector acceleration: - - `-Dskainet.cpu.vector.enabled=true|false` - - or `SKAINET_CPU_VECTOR_ENABLED=true|false` -- BLAS via Panama (matmul heuristic for larger sizes): - - `-Dskainet.cpu.blas.enabled=true|false` - - or `SKAINET_CPU_BLAS_ENABLED=true|false` - -Each benchmark also exposes `@Param` to toggle these flags without modifying Gradle args. - -#### How to run all benchmarks -From repository root: - -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh -``` - -This will build and execute all JMH benchmarks with the default parameters defined in sources. 
- -#### Run specific benchmarks -- Elementwise add (both vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=ElementwiseAdd1MBench -``` - -- Reductions (vector on/off): -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=Reductions1MBench -``` - -- Matmul, all sizes, with vector on and BLAS on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true -``` - -- Matmul at 512 only, comparing BLAS on/off with vector on: -``` -./gradlew :skainet-backends:benchmarks:jvm-cpu-jmh:jmh \ - -Pjmh.include=MatmulBench \ - -Pjmh.param.size=512 \ - -Pjmh.param.vectorEnabled=true \ - -Pjmh.param.blasEnabled=true,false -``` - -Notes: -- You can also pass system properties via `-D` if preferred (e.g., `-Dskainet.cpu.vector.enabled=false`). -- JMH JSON/text results can be configured via standard JMH plugin options if you need files for CI artifacts. - -#### Recording environment details -Include at minimum: -- CPU model, cores/threads, base/boost clock -- RAM size and speed -- OS version -- JDK version and vendor -- Gradle version -- JVM flags in use (`--enable-preview --add-modules jdk.incubator.vector`) -- SKaiNET flags used (vector, BLAS) - -#### Performance targets (to be validated on your hardware) -- β‰₯ 4Γ— speedup on FP32 `matmul` 512Γ—512 vs baseline scalar -- β‰₯ 3Γ— speedup on FP32 `add` with 1M elements vs baseline scalar - -Use the above commands to produce β€œvector=false/blas=false” baselines vs β€œvector=true[/blas=true]” accelerated runs. Capture best-of or median-of JMH results as evidence and include raw tables in this document when available. 
diff --git a/docs/skainet-4-ai.md b/docs/skainet-4-ai.md deleted file mode 100644 index d9d2c6a6..00000000 --- a/docs/skainet-4-ai.md +++ /dev/null @@ -1,127 +0,0 @@ -# SKaiNET Core Technology: Tensor & Data Guide - -This document provides technical instructions for AI agents and developers on using SKaiNET's Tensor and Data API as a modern, type-safe replacement for NDArray or Python's NumPy library. - -## 1. Fundamental Architecture: Tensor Composition - -Unlike traditional libraries where a Tensor is a monolithic object, SKaiNET adopts a **compositional architecture**. A `Tensor` is composed of two primary components: - -1. **`TensorData`**: Handles multi-dimensional storage, memory layout, indexing, and type-safe element access. -2. **`TensorOps`**: Encapsulates mathematical algorithms and transformations (CPU, GPU, etc.). - -This separation allows for high flexibility, such as switching execution backends without changing the data representation. - -```kotlin -interface Tensor { - val data: TensorData - val ops: TensorOps - val dtype: KClass - val shape: Shape -} -``` - -## 2. Type-Safe Tensor Creation (DSL) - -SKaiNET provides a powerful Type-Safe DSL for tensor creation. It ensures that the data provided matches the specified `DType` at compile-time (or through the DSL's internal validation). - -### Creation with `ExecutionContext` - -Tensors are always created within an `ExecutionContext`, which provides the necessary `TensorOps` and `TensorDataFactory`. 
- -```kotlin -// Basic creation -val zeros = ctx.zeros(Shape(2, 3), FP32::class) -val ones = ctx.ones(Shape(1, 10), Int32::class) -val full = ctx.full(Shape(5, 5), FP32::class, 42.0f) -``` - -### Expressive Tensor DSL - -For more complex initializations, use the `tensor` DSL: - -```kotlin -val myTensor = tensor(ctx, FP32::class) { - shape(2, 2) { - from(1.0f, 2.0f, 3.0f, 4.0f) - } -} - -val randomTensor = tensor(ctx, FP32::class) { - shape(10, 10) { - randn(mean = 0f, std = 1f) - } -} - -val customInit = tensor(ctx, Int32::class) { - shape(5, 5) { - init { indices -> indices[0] + indices[1] } - } -} -``` - -## 3. Slicing DSL API - -SKaiNET offers a sophisticated Slicing DSL that allows for creating views or copies of tensor segments with high precision and readability. - -### `sliceView` vs `sliceCopy` - -- **`sliceView`**: Creates a `TensorView`, which is a window into the original data (no data copying). -- **`sliceCopy`**: Creates a new `Tensor` with a copy of the sliced data. - -### Slicing DSL Syntax - -The `SegmentBuilder` provides several ways to define slices for each dimension: - -- `range(start, end)`: A range of indices. -- `at(index)`: A single index (reduces rank). -- `all()`: All elements in that dimension (equivalent to `:` in NumPy). -- `step(start, end, step)`: Strided access. -- `+all()`: Short-hand for `all()`. - -```kotlin -val source = ctx.ones(Shape(10, 20, 30), FP32::class) - -// Slicing: [0:5, 10, :] -val view = source.sliceView { - segment { range(0, 5) } // Dim 0 - segment { at(10) } // Dim 1 - segment { all() } // Dim 2 -} -``` - -## 4. Core Operations (`TensorOps`) - -All mathematical operations are dispatched through the `TensorOps` interface. SKaiNET supports: - -- **Element-wise Ops**: `add`, `subtract`, `multiply`, `divide` (and scalar versions). -- **Linear Algebra**: `matmul`, `transpose`. -- **Neural Network Ops**: `conv2d`, `maxPool2d`, `relu`, `softmax`, `sigmoid`, `gelu`. -- **Reductions**: `sum`, `mean`, `variance`. 
-- **Shape Ops**: `reshape`, `flatten`, `concat`, `squeeze`, `unsqueeze`. - -### Operator Overloading - -When a tensor is "bound" to ops (e.g., via `OpsBoundTensor`), you can use standard Kotlin operators: - -```kotlin -val c = a + b // Calls ops.add(a, b) -val d = a * 10 // Calls ops.mulScalar(a, 10) -``` - -## 5. Summary Table: SKaiNET vs NumPy - -| Feature | NumPy | SKaiNET | -| :--- | :--- | :--- | -| **Primary Type** | `ndarray` | `Tensor` | -| **Creation** | `np.array([1, 2, 3])` | `tensor(ctx, FP32::class) { shape(3) { from(1f, 2f, 3f) } }` | -| **Zeros** | `np.zeros((2, 2))` | `ctx.zeros(Shape(2, 2), FP32::class)` | -| **Slicing** | `a[0:5, :]` | `a.sliceView { segment { range(0, 5) }; segment { all() } }` | -| **Matmul** | `a @ b` or `np.matmul(a, b)` | `ctx.ops.matmul(a, b)` | -| **Reshape** | `a.reshape(new_shape)` | `ctx.ops.reshape(a, Shape(new_shape))` | - -## 6. Best Practices for AI Integration - -1. **Context Awareness**: Always pass the `ExecutionContext` to functions that create or manipulate tensors. -2. **Type Safety**: Prefer specific `DType` classes (e.g., `FP32::class`, `Int32::class`) to avoid runtime errors. -3. **Views over Copies**: Use `sliceView` whenever possible to minimize memory overhead and improve performance. -4. **Backend Agnostic**: Write logic against the `TensorOps` interface to ensure your code runs on any supported backend.