diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 70418739..a12d24b3 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -2,16 +2,13 @@ .Tutorials * xref:tutorials/java-getting-started.adoc[Java getting started] -* xref:tutorials/kllama-getting-started.adoc[KLlama getting started] * xref:tutorials/hlo-getting-started.adoc[StableHLO getting started] * xref:tutorials/graph-dsl.adoc[Graph DSL] .How-to guides * xref:how-to/build.adoc[Build from source] * xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] -* xref:how-to/java-cli-app.adoc[Build a Java CLI app] -* xref:how-to/java-llm-inference.adoc[Run LLM inference] -* xref:how-to/java-model-training.adoc[Train a model] +* xref:how-to/java-model-training.adoc[Train a model from Java] * xref:how-to/arduino-c-codegen.adoc[Generate C for Arduino] .Reference diff --git a/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc index 167aac22..36bcc71d 100644 --- a/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc +++ b/docs/modules/ROOT/pages/explanation/perf/jvm-cpu.adoc @@ -20,12 +20,12 @@ Source files: ===== Prerequisites -* JDK 21{plus} (JDK 22 toolchain configured by Gradle) -* Gradle will pass required JVM flags: +* JDK 21{plus} (CI builds on JDK 25) +* Gradle passes the required JVM flags automatically: ** `--enable-preview` ** `--add-modules jdk.incubator.vector` -For Java 25-specific performance advantages, see link:java-25-cpu-backend.md[Java 25 CPU Backend]. +For JDK 25-specific performance advantages, see xref:explanation/perf/java-25-cpu-backend.adoc[Java 25 CPU Backend notes]. 
===== Feature flags diff --git a/docs/modules/ROOT/pages/how-to/io-readers.adoc b/docs/modules/ROOT/pages/how-to/io-readers.adoc index 1f4b18da..550a020c 100644 --- a/docs/modules/ROOT/pages/how-to/io-readers.adoc +++ b/docs/modules/ROOT/pages/how-to/io-readers.adoc @@ -20,7 +20,7 @@ Add the following dependencies to your `build.gradle.kts`: [source,kotlin] ---- dependencies { - implementation("sk.ainet.core:skainet-io-gguf:0.5.0") + implementation("sk.ainet:skainet-io-gguf:0.19.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") } ---- @@ -30,7 +30,7 @@ dependencies { [source,kotlin] ---- dependencies { - implementation("sk.ainet.core:skainet-io-onnx:0.5.0") + implementation("sk.ainet:skainet-io-onnx:0.19.0") implementation("org.jetbrains.kotlinx:kotlinx-io-core:0.8.2") implementation("pro.streem.pbandk:pbandk-runtime:0.16.0") } diff --git a/docs/modules/ROOT/pages/how-to/java-cli-app.adoc b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc index a233942d..65be9bfd 100644 --- a/docs/modules/ROOT/pages/how-to/java-cli-app.adoc +++ b/docs/modules/ROOT/pages/how-to/java-cli-app.adoc @@ -1,289 +1,22 @@ -== Building a Java CLI App with KLlama - -This guide walks you through creating a standalone Java 21{plus} command-line application that loads a LLaMA model and generates text using the KLlama library. 
- -=== Prerequisites - -* *JDK 21 or later* (required for Vector API and virtual threads) -* *Maven 3.8{plus}* or *Gradle 8.4{plus}* -* A GGUF model file (e.g., https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF[TinyLlama-1.1B-Chat GGUF]) - -''''' - -=== Project Setup - -==== Maven - -Create a `pom.xml`: - -[source,xml] ----- - - 4.0.0 - - com.example - kllama-cli - 1.0-SNAPSHOT - jar - - - 21 - 21 - 0.13.0 - - - - - - sk.ainet - skainet-bom - ${skainet.version} - pom - import - - - - - - - - sk.ainet - skainet-kllama-jvm - - - - - sk.ainet - skainet-backend-cpu-jvm - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.11.0 - - 21 - 21 - - --enable-preview - - - - - - - org.codehaus.mojo - exec-maven-plugin - 3.1.0 - - com.example.KLlamaCli - - --enable-preview - --add-modules - jdk.incubator.vector - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.5.1 - - - package - shade - - - - com.example.KLlamaCli - - - - - - - - - ----- - -==== Gradle - -Create a `build.gradle` (Groovy DSL): - -[source,groovy] ----- -plugins { - id 'java' - id 'application' -} - -java { - toolchain { - languageVersion = JavaLanguageVersion.of(21) - } -} - -repositories { - mavenCentral() -} - -dependencies { - implementation platform('sk.ainet:skainet-bom:0.13.0') - implementation 'sk.ainet:skainet-kllama-jvm' - implementation 'sk.ainet:skainet-backend-cpu-jvm' -} - -application { - mainClass = 'com.example.KLlamaCli' - applicationDefaultJvmArgs = [ - '--enable-preview', - '--add-modules', 'jdk.incubator.vector' - ] -} - -tasks.withType(JavaCompile).configureEach { - options.compilerArgs.add('--enable-preview') -} ----- - -''''' - -=== Source Code - -Create `src/main/java/com/example/KLlamaCli.java`: - -[source,java] ----- -package com.example; - -import sk.ainet.apps.kllama.java.GenerationConfig; -import sk.ainet.apps.kllama.java.KLlamaJava; -import sk.ainet.apps.kllama.java.KLlamaSession; -import java.nio.file.Path; - -public class KLlamaCli { - - 
public static void main(String[] args) { - if (args.length < 2) { - System.err.println("Usage: kllama-cli \"\" [maxTokens] [temperature]"); - System.exit(1); - } - - Path modelPath = Path.of(args[0]); - String prompt = args[1]; - int maxTokens = args.length > 2 ? Integer.parseInt(args[2]) : 128; - float temperature = args.length > 3 ? Float.parseFloat(args[3]) : 0.8f; - - GenerationConfig config = GenerationConfig.builder() - .maxTokens(maxTokens) - .temperature(temperature) - .build(); - - System.out.println("Loading model from " + modelPath + " ..."); - - try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { - // Stream tokens to stdout as they are generated - session.generate(prompt, config, token -> System.out.print(token)); - System.out.println(); - } - } -} ----- - -''''' - -=== Building and Running - -==== With Maven - -[source,bash] ----- -# Run directly -mvn compile exec:java -Dexec.args="model.gguf 'Once upon a time' 128 0.7" - -# Build fat JAR -mvn package - -# Run from JAR -java --enable-preview --add-modules jdk.incubator.vector \ - -jar target/kllama-cli-1.0-SNAPSHOT.jar \ - model.gguf "Once upon a time" 128 0.7 ----- - -==== With Gradle - -[source,bash] ----- -# Run directly -./gradlew run --args="model.gguf 'Once upon a time' 128 0.7" - -# Build distribution -./gradlew installDist - -# Run from distribution -./build/install/kllama-cli/bin/kllama-cli \ - model.gguf "Once upon a time" 128 0.7 ----- - -''''' - -=== Loading SafeTensors Models - -To load a HuggingFace model directory instead of GGUF, use `loadSafeTensors` and point to the directory containing `model.safetensors`, `config.json`, and `tokenizer.json`: - -[source,java] ----- -try (KLlamaSession session = KLlamaJava.loadSafeTensors(Path.of("./my-llama-model/"))) { - session.generate("Hello", config, token -> System.out.print(token)); - System.out.println(); -} ----- - -''''' - -=== Async Generation - -Use `generateAsync` to run generation on a virtual thread and get a 
`CompletableFuture`: - -[source,java] ----- -import java.util.concurrent.CompletableFuture; - -try (KLlamaSession session = KLlamaJava.loadGGUF(modelPath)) { - CompletableFuture future = session.generateAsync( - "Explain quantum computing in one sentence", - GenerationConfig.builder().maxTokens(64).build() - ); - - // Do other work while generation runs... - - String result = future.join(); - System.out.println(result); -} ----- - -You can also compose futures: - -[source,java] ----- -session.generateAsync("Translate to French: Hello world") - .thenAccept(translation -> System.out.println("Translation: " + translation)) - .exceptionally(ex -> { ex.printStackTrace(); return null; }); ----- - -''''' - -=== Next Steps - -* link:java-llm-inference.md[Java LLM Inference Guide] — BERT embeddings, agent/tool-calling, and more. -* link:java-getting-started.md[Java Getting Started] — tensor operations, full Maven/Gradle setup. -* link:../skainet-apps/skainet-kllama/README.md[KLlama Library] — custom backends and Kotlin embedding. += Build a Java CLI app with KLlama — moved +:description: LLM CLI content moved to SKaiNET-transformers on 2026-04-13. + +[CAUTION] +==== +**This how-to moved.** The KLlama Java CLI example described here +depends on `sk.ainet:skainet-kllama-jvm` and the +`sk.ainet.apps.kllama.java` package, both of which now live in the +sibling https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers`] +repository. Mainline `SKaiNET` kept the engine layer only. +==== + +Start here instead: + +* https://skainet-developers.github.io/SKaiNET-transformers/[`SKaiNET-transformers` documentation site] +* https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers` GitHub] + +For the **non-LLM Java entry point** (tensor ops, models, training +loops on the CPU backend), the Java guides that remain in this +repository still apply — they just don't cover LLM +inference. 
See xref:tutorials/java-getting-started.adoc[Java getting started] +and xref:how-to/java-model-training.adoc[Train a model from Java]. diff --git a/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc index 567b9aa1..bb324481 100644 --- a/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc +++ b/docs/modules/ROOT/pages/how-to/java-llm-inference.adoc @@ -1,354 +1,34 @@ -== Java LLM Inference Guide - -This guide covers loading and running large language models (LLaMA, BERT) from Java using SKaiNET's blocking, streaming, and async APIs. - -=== Prerequisites - -* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` -* See link:java-getting-started.md[Java Getting Started] for project setup - -==== Maven Dependencies - -[source,xml] ----- - - - - sk.ainet - skainet-bom - 0.13.0 - pom - import - - - - - - - - sk.ainet - skainet-kllama-jvm - - - - - sk.ainet - skainet-kllama-agent-jvm - - - - - sk.ainet - skainet-bert-jvm - - - - - sk.ainet - skainet-backend-cpu-jvm - - ----- - -''''' - -=== LLaMA Inference - -All LLaMA Java classes live in `sk.ainet.apps.kllama.java`. - -==== Loading a GGUF Model - -The simplest way to get started is to load a GGUF file. `KLlamaJava.loadGGUF()` handles context creation, weight loading, quantization dispatch, and tokenizer setup behind the scenes. - -[source,java] ----- -import sk.ainet.apps.kllama.java.KLlamaJava; -import sk.ainet.apps.kllama.java.KLlamaSession; -import sk.ainet.apps.kllama.java.GenerationConfig; -import java.nio.file.Path; - -public class LlamaExample { - public static void main(String[] args) { - try (KLlamaSession session = KLlamaJava.loadGGUF(Path.of("tinyllama-1.1b-q4.gguf"))) { - String response = session.generate("The capital of France is"); - System.out.println(response); - } - } -} ----- - -`KLlamaSession` implements `AutoCloseable`, so `try-with-resources` properly releases the off-heap memory arenas when you are done. 
- -==== Loading SafeTensors (HuggingFace Format) - -If you have a HuggingFace model directory containing `model.safetensors`, `config.json`, and `tokenizer.json`: - -[source,java] ----- -try (KLlamaSession session = KLlamaJava.loadSafeTensors(Path.of("./my-llama-model/"))) { - String response = session.generate("Once upon a time"); - System.out.println(response); -} ----- - -The directory must contain: - -* `model.safetensors` -- the model weights -* `config.json` -- model architecture config (hidden size, layers, heads, etc.) -* `tokenizer.json` -- HuggingFace tokenizer definition - -''''' - -=== GenerationConfig - -Control generation parameters with the builder pattern: - -[source,java] ----- -GenerationConfig config = GenerationConfig.builder() - .maxTokens(256) // maximum tokens to generate (default: 256) - .temperature(0.7f) // sampling temperature (default: 0.8) - .build(); - -String response = session.generate("Explain quantum computing", config); ----- - -Use `GenerationConfig.defaults()` for the default configuration (256 max tokens, 0.8 temperature). - -''''' - -=== Streaming Generation - -Pass a `Consumer++<++String++>++` to receive each token as it is generated. This is useful for displaying output in real time: - -[source,java] ----- -GenerationConfig config = GenerationConfig.builder() - .maxTokens(512) - .temperature(0.9f) - .build(); - -String fullResponse = session.generate( - "Write a haiku about Java", - config, - token -> System.out.print(token) // stream tokens to stdout -); - -System.out.println(); // newline after streaming ----- - -The `generate` overload with a `Consumer++<++String++>++` still returns the complete generated text as its return value. 
- -''''' - -=== Async Generation - -`generateAsync` offloads generation to a virtual thread and returns a `CompletableFuture++<++String++>++`: - -[source,java] ----- -import java.util.concurrent.CompletableFuture; - -CompletableFuture future = session.generateAsync( - "Summarize the theory of relativity", - GenerationConfig.builder().maxTokens(200).build() -); - -// Do other work while generation runs... -String result = future.join(); // block when you need the result -System.out.println(result); ----- - -You can also compose futures: - -[source,java] ----- -session.generateAsync("Translate to French: Hello world") - .thenAccept(translation -> System.out.println("Translation: " + translation)) - .exceptionally(ex -> { ex.printStackTrace(); return null; }); ----- - -''''' - -=== BERT Encoding and Similarity - -All BERT Java classes live in `sk.ainet.apps.bert.java`. - -==== Loading a BERT Model - -Load a BERT model from a HuggingFace directory containing `model.safetensors` and `vocab.txt`: - -[source,java] ----- -import sk.ainet.apps.bert.java.KBertJava; -import sk.ainet.apps.bert.java.KBertSession; -import java.nio.file.Path; - -try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/"))) { - // Encode text into an embedding vector - float[] embedding = bert.encode("SKaiNET is a tensor framework"); - System.out.println("Embedding dimension: " + embedding.length); -} ----- - -The directory must contain: - -* `model.safetensors` -- BERT model weights -* `vocab.txt` -- WordPiece vocabulary -* `config.json` (optional) -- model config; defaults are used if absent - -==== Similarity Scoring - -Compute cosine similarity between two texts directly: - -[source,java] ----- -try (KBertSession bert = KBertJava.loadSafeTensors(Path.of("./bert-base-uncased/"))) { - float score = bert.similarity( - "The cat sat on the mat", - "A kitten rested on the rug" - ); - System.out.printf("Similarity: %.4f%n", score); // e.g. 
0.8923 - - // Compare unrelated texts - float low = bert.similarity( - "The cat sat on the mat", - "Stock prices rose sharply" - ); - System.out.printf("Unrelated: %.4f%n", low); // e.g. 0.1247 -} ----- - -The returned value is cosine similarity in the range ++[++-1, 1++]++. - -''''' - -=== Agent Loop and Tool Calling - -All agent/tool classes live in `sk.ainet.apps.kllama.chat.java`. - -The `JavaAgentLoop` lets the LLM call tools in a loop until it produces a final answer. You define tools by implementing the `JavaTool` interface. - -==== Defining a Tool - -[source,java] ----- -import sk.ainet.apps.kllama.chat.java.JavaTool; -import sk.ainet.apps.kllama.chat.ToolDefinition; -import java.util.Map; - -public class CalculatorTool implements JavaTool { - - @Override - public ToolDefinition getDefinition() { - return new ToolDefinition( - "calculator", - "Evaluate a mathematical expression", - Map.of( - "expression", Map.of( - "type", "string", - "description", "The math expression to evaluate" - ) - ) - ); - } - - @Override - public String execute(Map arguments) { - String expr = (String) arguments.get("expression"); - // Your evaluation logic here - double result = evaluate(expr); - return String.valueOf(result); - } - - private double evaluate(String expr) { - // Simple evaluation implementation - // ... 
- return 0.0; - } -} ----- - -==== Building and Using the Agent - -[source,java] ----- -import sk.ainet.apps.kllama.java.KLlamaJava; -import sk.ainet.apps.kllama.java.KLlamaSession; -import sk.ainet.apps.kllama.chat.java.JavaAgentLoop; -import java.nio.file.Path; - -try (KLlamaSession session = KLlamaJava.loadGGUF(Path.of("model.gguf"))) { - - JavaAgentLoop agent = JavaAgentLoop.builder() - .session(session) - .tool(new CalculatorTool()) - .systemPrompt("You are a helpful assistant with access to a calculator.") - .template("llama3") // or "chatml" - .build(); - - // The agent will call the calculator tool if needed - String answer = agent.chat("What is 42 * 17?"); - System.out.println(answer); - - // Multi-turn conversation -- context is preserved - String followUp = agent.chat("Now divide that result by 3"); - System.out.println(followUp); - - // Reset conversation history (keeps system prompt) - agent.reset(); -} ----- - -==== Streaming Agent Responses - -[source,java] ----- -String answer = agent.chat( - "What is the square root of 144?", - token -> System.out.print(token) -); ----- - -''''' - -=== Resource Management - -Both `KLlamaSession` and `KBertSession` implement `AutoCloseable`. Always use `try-with-resources` to ensure off-heap memory arenas and other native resources are released promptly: - -[source,java] ----- -// Single session -try (KLlamaSession session = KLlamaJava.loadGGUF(path)) { - session.generate("Hello"); -} - -// Multiple sessions -try (KLlamaSession llama = KLlamaJava.loadGGUF(llamaPath); - KBertSession bert = KBertJava.loadSafeTensors(bertPath)) { - - String text = llama.generate("Write a summary of quantum mechanics"); - float[] embedding = bert.encode(text); -} ----- - -Failing to close sessions will leak off-heap memory allocated via `java.lang.foreign.Arena`. 
- -''''' - -=== Package Reference - -[cols=",",options="header",] -|=== -|Package |Key Classes -|`sk.ainet.apps.kllama.java` |`KLlamaJava`, `KLlamaSession`, `GenerationConfig` -|`sk.ainet.apps.bert.java` |`KBertJava`, `KBertSession` -|`sk.ainet.apps.kllama.chat.java` |`JavaAgentLoop`, `JavaTool` -|=== - -''''' - -=== Next Steps - -* link:java-getting-started.md[Java Getting Started] -- tensor operations, project setup, and dependency management. -* link:java-model-training.md[Model Training Guide] -- build and train neural networks from Java. += Run LLM inference from Java — moved +:description: LLM inference content moved to SKaiNET-transformers on 2026-04-13. + +[CAUTION] +==== +**This how-to moved.** The KLlama / KBert Java inference APIs +described here (`sk.ainet.apps.kllama.java.KLlamaJava`, +`sk.ainet.apps.bert.java.KBertJava`, +`sk.ainet.apps.kllama.chat.java.JavaAgentLoop`, and friends) now +live in the sibling +https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers`] +repository. Mainline `SKaiNET` kept the engine layer only — +tensors, graph IR, backends, and model loading / tokenization. +==== + +Start here instead: + +* https://skainet-developers.github.io/SKaiNET-transformers/[`SKaiNET-transformers` documentation site] +* https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers` GitHub] + +What stayed in mainline and is still useful for LLM workflows: + +* xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] + — `StreamingGGUFReader`, `StreamingSafeTensorsReader`, zero-copy + file-backed loads, quantization-preserving paths. +* The `TokenizerFactory` in `skainet-io-core` dispatches to the right + implementation per model architecture — **Qwen / GPT-2 byte-level + BPE** via `QwenByteLevelBpeTokenizer`, **LLaMA / Gemma / TinyLlama + SentencePiece** via `SentencePieceTokenizer`. Both verified against + HuggingFace reference token IDs. 
Usable from Java via + `TokenizerFactory.fromGguf(fields)` or `fromTokenizerJson(json)`. +* The `TensorEncoding` metadata on `TensorSpec` carries Q4_K / Q8_0 / + TernaryPacked quant layout through the graph IR, so backends can + dispatch on it — see the 0.19.0 release notes. diff --git a/docs/modules/ROOT/pages/how-to/java-model-training.adoc b/docs/modules/ROOT/pages/how-to/java-model-training.adoc index ddb82976..0edec042 100644 --- a/docs/modules/ROOT/pages/how-to/java-model-training.adoc +++ b/docs/modules/ROOT/pages/how-to/java-model-training.adoc @@ -4,7 +4,7 @@ This guide covers building neural networks, defining loss functions and optimize === Prerequisites -* JDK 21{plus} with `--enable-preview --add-modules jdk.incubator.vector` +* JDK 21{plus} (CI builds on JDK 25); Gradle passes `--enable-preview --add-modules jdk.incubator.vector` automatically * See link:java-getting-started.md[Java Getting Started] for project setup ==== Maven Dependencies @@ -16,7 +16,7 @@ This guide covers building neural networks, defining loss functions and optimize sk.ainet skainet-bom - 0.13.0 + 0.19.0 pom import diff --git a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc index becdecee..88565033 100644 --- a/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/java-getting-started.adoc @@ -4,7 +4,7 @@ This guide gets you from zero to running tensor operations with SKaiNET in under === Prerequisites -* *JDK 21 or later* (required for Vector API and virtual threads) +* *JDK 21 or later* (CI builds on JDK 25; Vector API and virtual threads require 21{plus}) * *Maven 3.8{plus}* or *Gradle 8.4{plus}* === JVM Flags @@ -29,7 +29,7 @@ The `skainet-bom` manages all SKaiNET module versions so you never have to keep ---- - 0.13.0 + 0.19.0 @@ -127,7 +127,7 @@ repositories { dependencies { // Import BOM for version alignment - 
implementation(platform("sk.ainet:skainet-bom:0.13.0")) + implementation(platform("sk.ainet:skainet-bom:0.19.0")) // Core tensor library implementation("sk.ainet:skainet-lang-core-jvm") diff --git a/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc index 153d32ef..73a95c23 100644 --- a/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc +++ b/docs/modules/ROOT/pages/tutorials/kllama-getting-started.adoc @@ -1,26 +1,28 @@ -== KLlama Getting Started += KLlama is in SKaiNET-transformers now +:description: LLM runtimes moved to the sibling repo on 2026-04-13. -KLlama is a pure Kotlin LLaMA inference runtime that runs on JVM, Native, JS, and WebAssembly. It supports GGUF, SafeTensors, and Karpathy .bin model formats with on-the-fly quantization support. +[CAUTION] +==== +**This content moved.** LLM runtimes (KLlama, KBert, chat, tools, +agent loop) now live in the sibling +https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers`] +repository. Mainline `SKaiNET` kept the engine layer only +(tensors, graph IR, compile / StableHLO, backends, tokenizers, +model loaders). +==== -____ -*Early Stage Development*: The project is in active development. We appreciate your feedback and bug reports! -____ +The getting-started guide for KLlama now lives alongside the +runtime it documents. 
Start here instead: -=== Choose Your Path +* https://github.com/SKaiNET-developers/SKaiNET-transformers[`SKaiNET-transformers` GitHub] +* https://skainet-developers.github.io/SKaiNET-transformers/[`SKaiNET-transformers` documentation site] -[cols=",",options="header",] -|=== -|Goal |Guide -|*Run models from the command line* |link:../skainet-apps/skainet-kllama-cli/README.md[KLlama CLI] -|*Embed in a Kotlin application* |link:../skainet-apps/skainet-kllama/README.md[KLlama Library] -|*Embed in a Java application* |link:java-llm-inference.md[Java LLM Inference Guide] -|*Build a standalone Java CLI app* |link:java-cli-app.md[Java CLI App Guide] -|*Java project setup (Maven / Gradle)* |link:java-getting-started.md[Java Getting Started] -|=== +If you were looking for the **tokenizer** side of LLM inference +(Qwen byte-level BPE, SentencePiece for LLaMA / Gemma / TinyLlama), +that still lives in mainline — see +xref:how-to/io-readers.adoc[Load models (GGUF, SafeTensors, ONNX)] +and the +https://github.com/SKaiNET-developers/SKaiNET/blob/develop/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TokenizerFactory.kt[`TokenizerFactory` source]. -=== Quick Links - -* link:++../skainet-apps/skainet-kllama/README.md#supported-formats--quantization++[Supported formats & quantization] -* link:../skainet-apps/skainet-kllama/README.md#custom-backend-integration[Custom backend integration] -* link:java-llm-inference.md#agent-loop-and-tool-calling[Agent & tool calling] -* link:java-llm-inference.md#bert-encoding-and-similarity[BERT embeddings & similarity] +If you were looking for the **Java entry point** for running +tensor ops on CPU, see xref:tutorials/java-getting-started.adoc[Java getting started].