diff --git a/CHANGELOG.md b/CHANGELOG.md index 9820e62f..f9edcfc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [Unreleased] + +### Added +- **Qwen / GPT-2 Byte-Level BPE Tokenizer**: `QwenByteLevelBpeTokenizer` implements the full GPT-2-style pipeline — byte-to-unicode mapping, GPT-2 pretokenization regex, merge-rank BPE, and atomic special-token splitting. Builds from either GGUF metadata (`fromGgufFields`) or a HuggingFace `tokenizer.json` (`fromTokenizerJson`). Verified against Qwen2.5-0.5B reference token IDs from HuggingFace `transformers`. +- **`TokenizerFactory` with Per-Architecture Dispatch**: Tokenizer selection is now **per-architecture, not per file format**. `TokenizerFactory.fromGguf(fields)` and `.fromTokenizerJson(json)` inspect `tokenizer.ggml.model` / `model.type` and dispatch to the right implementation — so a Qwen model uses byte-level BPE whether its weights come from `.gguf` or `.safetensors`. +- **`Tokenizer` Interface**: Common surface implemented by `TekkenTokenizer` and `QwenByteLevelBpeTokenizer` (`encode`, `decode`, `vocabSize`, `bosTokenId`, `eosTokenId`). +- **GGUF Tokenizer Metadata**: `GgufModelMetadata` now exposes `tokenizerModel`, `tokenizerTokens`, `tokenizerMerges`, `tokenizerTokenTypes`, `bosTokenId`, and `eosTokenId` so callers can build a tokenizer without re-parsing the raw field map. + +### Fixed +- **Byte-Level BPE Broken for Qwen/GPT-2 Models**: Previously there was no GPT-2-style byte-level BPE tokenizer in the repo, and `GgufModelMetadata` ignored `tokenizer.ggml.merges` entirely — so any Qwen / GPT-2 / Mistral-Nemo model encoded text into garbage tokens (byte-level chars instead of merged vocab IDs), blocking chat mode and tool calling. The new `QwenByteLevelBpeTokenizer` + `TokenizerFactory` dispatch fix the issue for both GGUF and SafeTensors sources. SentencePiece / LLaMA support is tracked separately in #464. 
(#463) + ## [0.18.0] - 2026-04-08 ### Added diff --git a/skainet-io/skainet-io-core/build.gradle.kts b/skainet-io/skainet-io-core/build.gradle.kts index 36351129..a7e91566 100644 --- a/skainet-io/skainet-io-core/build.gradle.kts +++ b/skainet-io/skainet-io-core/build.gradle.kts @@ -2,6 +2,7 @@ import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl import org.jetbrains.kotlin.gradle.dsl.JvmTarget +import java.net.URI plugins { alias(libs.plugins.kotlinMultiplatform) @@ -74,6 +75,7 @@ kotlin { dependencies { implementation(libs.kotlinx.coroutines) implementation(project(":skainet-backends:skainet-backend-cpu")) + implementation(project(":skainet-io:skainet-io-gguf")) } } @@ -97,3 +99,47 @@ kotlin { } } } + +// ============================================================================ +// Test fixtures for QwenByteLevelBpeTokenizer end-to-end tests (#463). +// +// Downloads a small public Qwen2.5 model + tokenizer.json into +// build/test-fixtures/. Tests check for file presence and skip cleanly +// when absent, so offline/CI builds without network still stay green. +// +// Run `./gradlew :skainet-io:skainet-io-core:downloadQwenTokenizerFixtures` +// once before running the fixture-gated tests. 
+// ============================================================================ +val fixturesDir = layout.buildDirectory.dir("test-fixtures") + +val downloadQwenTokenizerFixtures by tasks.registering { + group = "verification" + description = "Download Qwen2.5-0.5B GGUF + tokenizer.json for #463 tests" + val outDir = fixturesDir + outputs.dir(outDir) + doLast { + val dir = outDir.get().asFile.apply { mkdirs() } + val files = listOf( + "Qwen2.5-0.5B-Instruct-Q8_0.gguf" to + "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf", + "tokenizer.json" to + "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json", + ) + for ((name, url) in files) { + val target = dir.resolve(name) + if (target.exists() && target.length() > 0) { + logger.lifecycle("fixture already present: ${target.name}") + continue + } + logger.lifecycle("downloading $name from $url") + URI(url).toURL().openStream().use { input -> + target.outputStream().use { out -> input.copyTo(out) } + } + logger.lifecycle(" -> ${target.length()} bytes") + } + } +} + +tasks.withType<Test>().configureEach { + systemProperty("skainet.test.fixturesDir", fixturesDir.get().asFile.absolutePath) +} diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/ByteToUnicode.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/ByteToUnicode.kt new file mode 100644 index 00000000..8841cd15 --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/ByteToUnicode.kt @@ -0,0 +1,71 @@ +package sk.ainet.io.tokenizer + +/** + * GPT-2 byte-to-unicode mapping. + * + * Byte-level BPE tokenizers (GPT-2, Qwen, Mistral-Nemo, …) operate on a + * reversible map from every possible byte (0..255) to a unique printable + * Unicode code point.
This avoids control characters and whitespace + * appearing as "bytes" inside BPE symbols, which would otherwise collide + * with regex pretokenization and JSON serialization. + * + * The table is the canonical one from Karpathy's `bytes_to_unicode` + * (see https://github.com/openai/gpt-2/blob/master/src/encoder.py and + * HuggingFace `tokenizers`): printable ASCII (`!`..`~`), Latin-1 + * supplement blocks (`¡`..`¬`, `®`..`ÿ`) map to themselves; every other + * byte is relocated into the 256..323 range. + * + * Every mapped code point is in the BMP (< U+10000), so `Char` iteration + * is sufficient — no surrogate-pair handling required. + */ +internal object ByteToUnicode { + + /** `byteToUnicode[b]` is the `Char` representing byte `b`. */ + val byteToUnicode: CharArray = buildByteToUnicode() + + /** Reverse lookup: `Char` → original byte (0..255). */ + val unicodeToByte: Map<Char, Byte> = buildUnicodeToByte(byteToUnicode) + + private fun buildByteToUnicode(): CharArray { + val printable = mutableListOf<Int>() + for (b in '!'.code..'~'.code) printable.add(b) + for (b in '¡'.code..'¬'.code) printable.add(b) + for (b in '®'.code..'ÿ'.code) printable.add(b) + + val printableSet = printable.toHashSet() + val result = CharArray(256) + for (b in printable) result[b] = b.toChar() + + var next = 256 + for (b in 0..255) { + if (b !in printableSet) { + result[b] = next.toChar() + next++ + } + } + return result + } + + private fun buildUnicodeToByte(forward: CharArray): Map<Char, Byte> { + val map = HashMap<Char, Byte>(256) + for (b in 0..255) map[forward[b]] = b.toByte() + return map + } + + /** Encode a UTF-8 byte sequence to its byte-level BPE string form. */ + fun encode(bytes: ByteArray): String { + val sb = StringBuilder(bytes.size) + for (b in bytes) sb.append(byteToUnicode[b.toInt() and 0xFF]) + return sb.toString() + } + + /** Decode a byte-level BPE string back to its UTF-8 byte sequence.
*/ + fun decode(s: String): ByteArray { + val out = ByteArray(s.length) + for (i in s.indices) { + out[i] = unicodeToByte[s[i]] + ?: error("byte-level BPE string contained unmapped char: U+${s[i].code.toString(16)}") + } + return out + } +} diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizer.kt new file mode 100644 index 00000000..35c14bf2 --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizer.kt @@ -0,0 +1,279 @@ +package sk.ainet.io.tokenizer + +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.boolean +import kotlinx.serialization.json.int +import kotlinx.serialization.json.jsonArray +import kotlinx.serialization.json.jsonObject +import kotlinx.serialization.json.jsonPrimitive + +/** + * GPT-2-style byte-level BPE tokenizer (Qwen, GPT-2, Mistral-Nemo, …). + * + * Implements the seven-step encoding pipeline required by HuggingFace + * `transformers` / `tokenizers` and llama.cpp for byte-level BPE: + * + * 1. Split input on the longest-match special token at each position + * (`<|im_start|>`, `<|endoftext|>`, …) — these are atomic token IDs. + * 2. For each non-special segment, apply the GPT-2 pretokenization regex. + * 3. UTF-8-encode each regex chunk. + * 4. Map bytes → unicode via [ByteToUnicode] (so "Hello" becomes `Hello`, + * " is" becomes `Ġis`, "\n" becomes `Ċ`). + * 5. Apply BPE merges to the resulting char sequence, always picking the + * pair with the **lowest merge rank** (not highest vocab score — that's + * the SentencePiece rule, not GPT-2 BPE). + * 6. Look up each resulting symbol in the vocab → token id. + * 7. Decode is the reverse: concat token strings, reverse byte-to-unicode, + * UTF-8-decode. + * + * @property tokens vocab, indexed by token id. 
Must include the byte-level + * base alphabet (256 single-char entries) and every merged symbol. + * @property merges merge list in **priority order** — rank 0 is the highest + * priority merge. Each entry is a `first second` pair of BPE symbols. + * @property specialTokens map from the literal string form (e.g. + * `"<|im_start|>"`) to its token id. Longest-match wins. + * @property bosTokenId optional BOS id (not emitted automatically by + * [encode]; callers add it if they want one). + * @property eosTokenId optional EOS id. + */ +public class QwenByteLevelBpeTokenizer( + tokens: List<String>, + merges: List<Pair<String, String>>, + private val specialTokens: Map<String, Int>, + override val bosTokenId: Int? = null, + override val eosTokenId: Int? = null, +) : Tokenizer { + + private val tokenToId: Map<String, Int> + private val idToToken: Array<String> + private val mergeRank: Map<Pair<String, String>, Int> + private val specialIdToString: Map<Int, String> + private val specialTokensByLengthDesc: List<String> + + init { + tokenToId = HashMap<String, Int>(tokens.size * 2).also { m -> + for (i in tokens.indices) m[tokens[i]] = i + } + idToToken = tokens.toTypedArray() + mergeRank = HashMap<Pair<String, String>, Int>(merges.size * 2).also { m -> + for (i in merges.indices) m[merges[i]] = i + } + specialIdToString = specialTokens.entries.associate { (k, v) -> v to k } + // Longest-first so `<|im_start|>` wins over a hypothetical `<|im`.
+ specialTokensByLengthDesc = specialTokens.keys.sortedByDescending { it.length } + } + + override val vocabSize: Int get() = idToToken.size + + override fun encode(text: String): IntArray { + val out = ArrayList<Int>(text.length) + var i = 0 + while (i < text.length) { + val matched = matchSpecialAt(text, i) + if (matched != null) { + out.add(specialTokens.getValue(matched)) + i += matched.length + continue + } + val nextSpecial = nextSpecialStart(text, i) + val segment = text.substring(i, nextSpecial) + encodeSegment(segment, out) + i = nextSpecial + } + return IntArray(out.size) { out[it] } + } + + override fun decode(ids: IntArray): String { + val sb = StringBuilder() + val byteBuf = ArrayList<Byte>() + for (id in ids) { + val special = specialIdToString[id] + if (special != null) { + if (byteBuf.isNotEmpty()) { + sb.append(flushBytes(byteBuf)) + } + sb.append(special) + continue + } + val token = idToToken.getOrNull(id) + ?: error("decode: unknown token id $id") + for (c in token) { + val b = ByteToUnicode.unicodeToByte[c] + ?: error("decode: token '$token' contains non-byte-level char U+${c.code.toString(16)}") + byteBuf.add(b) + } + } + if (byteBuf.isNotEmpty()) sb.append(flushBytes(byteBuf)) + return sb.toString() + } + + private fun flushBytes(buf: ArrayList<Byte>): String { + val arr = ByteArray(buf.size) { buf[it] } + buf.clear() + return arr.decodeToString() + } + + private fun matchSpecialAt(text: String, from: Int): String?
{ + for (tok in specialTokensByLengthDesc) { + if (tok.isNotEmpty() && text.regionMatches(from, tok, 0, tok.length)) return tok + } + return null + } + + private fun nextSpecialStart(text: String, from: Int): Int { + var best = text.length + for (tok in specialTokensByLengthDesc) { + if (tok.isEmpty()) continue + val idx = text.indexOf(tok, from + 1) + if (idx in 0 until best) best = idx + } + return best + } + + private fun encodeSegment(segment: String, out: ArrayList<Int>) { + if (segment.isEmpty()) return + for (match in PRETOKENIZE_REGEX.findAll(segment)) { + val chunk = match.value + if (chunk.isEmpty()) continue + val byteString = ByteToUnicode.encode(chunk.encodeToByteArray()) + val pieces = bpeMerge(byteString) + for (piece in pieces) { + val id = tokenToId[piece] + ?: error("BPE produced symbol not in vocab: '$piece' (from chunk '$chunk')") + out.add(id) + } + } + } + + private fun bpeMerge(word: String): List<String> { + if (word.length <= 1) return listOf(word) + val pieces = ArrayList<String>(word.length) + for (c in word) pieces.add(c.toString()) + + while (pieces.size > 1) { + var bestRank = Int.MAX_VALUE + var bestIdx = -1 + for (i in 0 until pieces.size - 1) { + val rank = mergeRank[pieces[i] to pieces[i + 1]] + if (rank != null && rank < bestRank) { + bestRank = rank + bestIdx = i + } + } + if (bestIdx == -1) break + pieces[bestIdx] = pieces[bestIdx] + pieces[bestIdx + 1] + pieces.removeAt(bestIdx + 1) + } + return pieces + } + + public companion object { + // GPT-2 pretokenization regex (Karpathy / HuggingFace). Splits text + // into word-like chunks before BPE so merges cannot cross word + // boundaries. Leading-space variants are intentional — that's how + // " is" encodes to a single `Ġis` token rather than ` ` + `is`. + private val PRETOKENIZE_REGEX = Regex( + "'(?:[sdmt]|ll|ve|re)| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+" + ) + + /** + * Build from GGUF metadata fields (see `GgufModelMetadata.rawFields`).
+ * + * Treats every token whose token_type code is `3` (control/special) + * as an atomic special token. Merges arrive as space-separated + * `"first second"` strings in GGUF. + */ + @Suppress("UNCHECKED_CAST") + public fun fromGgufFields(fields: Map<String, Any?>): QwenByteLevelBpeTokenizer { + val tokens = (fields["tokenizer.ggml.tokens"] as? List<*>) + ?.filterIsInstance<String>() + ?: error("tokenizer.ggml.tokens missing or malformed") + val mergesRaw = (fields["tokenizer.ggml.merges"] as? List<*>) + ?.filterIsInstance<String>() + ?: error("tokenizer.ggml.merges missing — required for byte-level BPE") + val tokenTypes = (fields["tokenizer.ggml.token_type"] as? List<*>) + ?.mapNotNull { (it as? Number)?.toInt() } + + val merges = mergesRaw.map { line -> + val sp = line.indexOf(' ') + require(sp > 0) { "malformed merge line: '$line'" } + line.substring(0, sp) to line.substring(sp + 1) + } + + val specialTokens = HashMap<String, Int>() + if (tokenTypes != null) { + for (i in tokens.indices) { + if (i < tokenTypes.size && tokenTypes[i] == TOKEN_TYPE_CONTROL) { + specialTokens[tokens[i]] = i + } + } + } + + return QwenByteLevelBpeTokenizer( + tokens = tokens, + merges = merges, + specialTokens = specialTokens, + bosTokenId = (fields["tokenizer.ggml.bos_token_id"] as? Number)?.toInt(), + eosTokenId = (fields["tokenizer.ggml.eos_token_id"] as? Number)?.toInt(), + ) + } + + /** + * Build from a parsed HuggingFace `tokenizer.json` root object. + * + * Expects `model.type == "BPE"`. The caller ([TokenizerFactory]) is + * responsible for dispatch; this builder trusts the shape and fails + * loudly if required keys are missing. + */ + public fun fromTokenizerJson(root: JsonObject): QwenByteLevelBpeTokenizer { + val model = root["model"]?.jsonObject + ?: error("tokenizer.json missing 'model'") + val vocab = model["vocab"]?.jsonObject + ?: error("tokenizer.json missing 'model.vocab'") + + // Build tokens[] indexed by id. Vocab is a string -> id map; we + // invert it into an array.
Gaps (should not happen in practice) + // are filled with empty strings so ids stay contiguous. + val maxId = vocab.values.maxOf { it.jsonPrimitive.int } + val tokens = Array(maxId + 1) { "" } + for ((tok, idEl) in vocab) { + tokens[idEl.jsonPrimitive.int] = tok + } + + val mergesJson = model["merges"]?.jsonArray + ?: error("tokenizer.json missing 'model.merges'") + val merges = mergesJson.map { el -> + when (el) { + is JsonObject -> error("tokenizer.json merges: object form not supported") + else -> { + val line = el.jsonPrimitive.content + val sp = line.indexOf(' ') + require(sp > 0) { "malformed merge line: '$line'" } + line.substring(0, sp) to line.substring(sp + 1) + } + } + } + + val specialTokens = HashMap<String, Int>() + val added = root["added_tokens"]?.jsonArray + if (added != null) { + for (entry in added) { + val obj = entry.jsonObject + val content = obj["content"]?.jsonPrimitive?.content ?: continue + val id = obj["id"]?.jsonPrimitive?.int ?: continue + val isSpecial = obj["special"]?.jsonPrimitive?.boolean ?: true + if (isSpecial) specialTokens[content] = id + } + } + + return QwenByteLevelBpeTokenizer( + tokens = tokens.toList(), + merges = merges, + specialTokens = specialTokens, + ) + } + + /** GGUF token type codes (ggml convention). */ + private const val TOKEN_TYPE_CONTROL = 3 + } +} diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TekkenTokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TekkenTokenizer.kt index 64212e5b..003da340 100644 --- a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TekkenTokenizer.kt +++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TekkenTokenizer.kt @@ -34,7 +34,7 @@ public class TekkenTokenizer( private val specialTokensById: Map<Int, String>, private val numSpecialTokens: Int = 1000, private val pattern: Regex -) { +) : Tokenizer { /** BPE rank lookup: byte sequence → rank (merge priority).
*/ private val bytesToRank: HashMap = HashMap(vocabTokenBytes.size * 2) @@ -45,16 +45,16 @@ public class TekkenTokenizer( } /** Number of vocab tokens (excluding special tokens). */ - public val vocabSize: Int get() = vocabTokenBytes.size + override val vocabSize: Int get() = vocabTokenBytes.size /** Total token count (vocab + special tokens). */ public val totalTokens: Int get() = vocabTokenBytes.size + numSpecialTokens /** BOS token ID. */ - public val bosTokenId: Int get() = specialTokens[""] ?: 1 + override val bosTokenId: Int get() = specialTokens[""] ?: 1 /** EOS token ID. */ - public val eosTokenId: Int get() = specialTokens[""] ?: 2 + override val eosTokenId: Int get() = specialTokens[""] ?: 2 /** * Encode text to token IDs. @@ -63,7 +63,7 @@ public class TekkenTokenizer( * 2. For each chunk, convert to bytes and apply BPE merges * 3. Offset ranks by numSpecialTokens to get final IDs */ - public fun encode(text: String): IntArray { + override fun encode(text: String): IntArray { val tokens = mutableListOf() // Check for special tokens in the text first @@ -112,11 +112,11 @@ public class TekkenTokenizer( /** * Decode token IDs to text. */ - public fun decode(tokens: IntArray): String { + override fun decode(ids: IntArray): String { val bytes = mutableListOf() val result = StringBuilder() - for (id in tokens) { + for (id in ids) { if (id < numSpecialTokens) { // Flush accumulated bytes if (bytes.isNotEmpty()) { diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt new file mode 100644 index 00000000..2100d8d1 --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/Tokenizer.kt @@ -0,0 +1,18 @@ +package sk.ainet.io.tokenizer + +/** + * Common surface for all tokenizer implementations. + * + * Tokenizer selection is **per-architecture, not per file format** — see + * [TokenizerFactory]. 
A Qwen model needs byte-level BPE whether its weights + * come from `.gguf` or `.safetensors`; a LLaMA model needs SentencePiece + * regardless of format. + */ +public interface Tokenizer { + public val vocabSize: Int + public val bosTokenId: Int? + public val eosTokenId: Int? + + public fun encode(text: String): IntArray + public fun decode(ids: IntArray): String +} diff --git a/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TokenizerFactory.kt b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TokenizerFactory.kt new file mode 100644 index 00000000..f9a5c5b7 --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonMain/kotlin/sk/ainet/io/tokenizer/TokenizerFactory.kt @@ -0,0 +1,77 @@ +package sk.ainet.io.tokenizer + +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.jsonObject +import kotlinx.serialization.json.jsonPrimitive + +/** + * Selects the right [Tokenizer] implementation for a model. + * + * Tokenizer selection is **per-architecture, not per file format.** A Qwen + * model needs byte-level BPE whether its weights come from `.gguf` or + * `.safetensors`; a LLaMA model needs SentencePiece regardless of format. + * Callers pass either a GGUF metadata field map or a HuggingFace + * `tokenizer.json` string, and this factory inspects the tokenizer type + * (`tokenizer.ggml.model` or `model.type`) to dispatch. + * + * Currently supported: Qwen / GPT-2-style byte-level BPE. SentencePiece + * (LLaMA/Gemma/TinyLlama) and WordPiece (BERT) throw + * [UnsupportedTokenizerException] — see #464. + */ +public object TokenizerFactory { + + /** + * Build a tokenizer from a GGUF metadata field map. + * + * Callers typically pass `streamingReader.fields` or + * `ggufModelMetadata.rawFields` — this keeps `skainet-io-core` free of a + * dependency on `skainet-io-gguf`. + */ + public fun fromGguf(fields: Map<String, Any?>): Tokenizer { + val model = (fields["tokenizer.ggml.model"] as?
String)?.lowercase() + ?: throw UnsupportedTokenizerException( + "GGUF metadata has no 'tokenizer.ggml.model' field" + ) + return when (model) { + "gpt2", "bpe" -> QwenByteLevelBpeTokenizer.fromGgufFields(fields) + "llama", "sentencepiece" -> throw UnsupportedTokenizerException( + "SentencePiece/LLaMA tokenizer not yet implemented (see #464)" + ) + "bert", "wordpiece" -> throw UnsupportedTokenizerException( + "WordPiece/BERT tokenizer not yet implemented" + ) + else -> throw UnsupportedTokenizerException( + "Unknown GGUF tokenizer.ggml.model: '$model'" + ) + } + } + + /** + * Build a tokenizer from a HuggingFace `tokenizer.json` string. + * + * Dispatches on `model.type`: `"BPE"` + byte-level pretokenizer routes + * to [QwenByteLevelBpeTokenizer]; `"Unigram"` (SentencePiece) and + * `"WordPiece"` currently throw. + */ + public fun fromTokenizerJson(json: String): Tokenizer { + val root = JSON.parseToJsonElement(json).jsonObject + val modelType = root["model"]?.jsonObject?.get("type")?.jsonPrimitive?.content + ?: throw UnsupportedTokenizerException("tokenizer.json has no model.type") + return when (modelType) { + "BPE" -> QwenByteLevelBpeTokenizer.fromTokenizerJson(root) + "Unigram" -> throw UnsupportedTokenizerException( + "Unigram/SentencePiece tokenizer.json not yet implemented (see #464)" + ) + "WordPiece" -> throw UnsupportedTokenizerException( + "WordPiece tokenizer.json not yet implemented" + ) + else -> throw UnsupportedTokenizerException( + "Unknown tokenizer.json model.type: '$modelType'" + ) + } + } + + internal val JSON: Json = Json { ignoreUnknownKeys = true; isLenient = true } +} + +public class UnsupportedTokenizerException(message: String) : RuntimeException(message) diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/ByteToUnicodeTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/ByteToUnicodeTest.kt new file mode 100644 index 00000000..9f4da65a --- /dev/null +++ 
b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/ByteToUnicodeTest.kt @@ -0,0 +1,65 @@ +package sk.ainet.io.tokenizer + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +class ByteToUnicodeTest { + + @Test + fun `every byte maps to a unique char`() { + val seen = HashSet<Char>() + for (b in 0..255) { + val c = ByteToUnicode.byteToUnicode[b] + assertTrue(seen.add(c), "duplicate mapping for byte $b -> U+${c.code.toString(16)}") + } + assertEquals(256, seen.size) + } + + @Test + fun `byte to unicode round trip covers all 256 bytes`() { + val bytes = ByteArray(256) { it.toByte() } + val encoded = ByteToUnicode.encode(bytes) + val decoded = ByteToUnicode.decode(encoded) + assertEquals(256, encoded.length) + assertTrue(bytes.contentEquals(decoded), "round-trip failed") + } + + @Test + fun `printable ASCII maps to itself`() { + for (b in '!'.code..'~'.code) { + assertEquals(b.toChar(), ByteToUnicode.byteToUnicode[b]) + } + } + + @Test + fun `control characters are relocated into 256 range`() { + // Newline (0x0A), tab (0x09), space (0x20) are not in the printable + // set, so they must be relocated to >= 256.
+ assertTrue(ByteToUnicode.byteToUnicode[0x0A].code >= 256) + assertTrue(ByteToUnicode.byteToUnicode[0x09].code >= 256) + assertTrue(ByteToUnicode.byteToUnicode[0x20].code >= 256) + } + + @Test + fun `newline maps to canonical GPT-2 code point`() { + // Canonical GPT-2: 0x0A -> U+010A ('Ċ') + assertEquals('Ċ', ByteToUnicode.byteToUnicode[0x0A]) + } + + @Test + fun `space maps to canonical GPT-2 code point`() { + // Canonical GPT-2: 0x20 -> U+0120 ('Ġ') + assertEquals('Ġ', ByteToUnicode.byteToUnicode[0x20]) + } + + @Test + fun `utf-8 multi-byte round trip`() { + val input = "Héllo 世界\n" + val utf8 = input.encodeToByteArray() + val encoded = ByteToUnicode.encode(utf8) + val decoded = ByteToUnicode.decode(encoded) + assertTrue(utf8.contentEquals(decoded)) + assertEquals(input, decoded.decodeToString()) + } +} diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerCoreTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerCoreTest.kt new file mode 100644 index 00000000..128bcefd --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerCoreTest.kt @@ -0,0 +1,103 @@ +package sk.ainet.io.tokenizer + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Synthetic tests for the BPE core. Builds a minimal tokenizer + * by hand so the algorithm can be locked in without a real model + * fixture — end-to-end reference tests against a real Qwen model + * live in `QwenByteLevelBpeTokenizerTest` (jvmTest). + */ +class QwenByteLevelBpeTokenizerCoreTest { + + /** + * Build a vocab containing: + * - the 256 byte-level base alphabet (so every byte has a token) + * - the five chars of "Hello" then the merges "He", "ll", "Hell", "Hello" + * + * Merge order matters: we want "Hello" to collapse to a single id. 
*/ + private fun buildToyTokenizer(): QwenByteLevelBpeTokenizer { + val tokens = mutableListOf<String>() + for (b in 0..255) tokens.add(ByteToUnicode.byteToUnicode[b].toString()) + val merges = mutableListOf<Pair<String, String>>() + + fun addMerge(a: String, b: String) { + merges.add(a to b) + tokens.add(a + b) + } + addMerge("H", "e") // He + addMerge("l", "l") // ll + addMerge("He", "ll") // Hell + addMerge("Hell", "o") // Hello + + return QwenByteLevelBpeTokenizer( + tokens = tokens, + merges = merges, + specialTokens = mapOf("<|end|>" to tokens.size.also { tokens.add("<|end|>") }) + ) + } + + @Test + fun `merges collapse Hello to a single token`() { + val tok = buildToyTokenizer() + val ids = tok.encode("Hello") + assertEquals(1, ids.size, "Hello must collapse to one merged token, got ${ids.toList()}") + assertEquals("Hello", tok.decode(ids)) + } + + @Test + fun `decode is inverse of encode for ascii`() { + val tok = buildToyTokenizer() + val input = "Hello!" + assertEquals(input, tok.decode(tok.encode(input))) + } + + @Test + fun `special tokens are atomic and not BPE-merged`() { + val tok = buildToyTokenizer() + val ids = tok.encode("Hello<|end|>") + // first id: merged "Hello"; second id: the special + assertEquals(2, ids.size) + assertEquals("Hello<|end|>", tok.decode(ids)) + } + + @Test + fun `newline survives round trip as a single byte token`() { + val tok = buildToyTokenizer() + val ids = tok.encode("\n") + assertEquals(1, ids.size) + assertEquals("\n", tok.decode(ids)) + } + + @Test + fun `unknown merge pairs fall through to byte tokens`() { + val tok = buildToyTokenizer() + // 'z' has no merges — must emit a single-byte token.
+ val ids = tok.encode("z") + assertEquals(1, ids.size) + assertEquals("z", tok.decode(ids)) + } + + @Test + fun `leading space attaches to following word via pretokenize regex`() { + val tok = buildToyTokenizer() + // " Hello" pretokenizes to one chunk; no merge exists for Ġ+H, + // so we expect 2 tokens: " " (as Ġ) + the merged "Hello" — but + // actually since the chunk is " Hello", BPE runs over `ĠHello` + // and finds no merge for Ġ+H, so we get Ġ, H, e, l, l, o initially, + // then "He"+"ll"->Hell, Hell+o->Hello, giving [Ġ, Hello]. + val ids = tok.encode(" Hello") + assertEquals(2, ids.size, "expected [Ġ, Hello], got ${ids.toList()}") + assertEquals(" Hello", tok.decode(ids)) + } + + @Test + fun `vocab size reflects added tokens`() { + val tok = buildToyTokenizer() + // 256 bytes + 4 merges + 1 special + assertTrue(tok.vocabSize >= 256 + 4) + } +} diff --git a/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/TokenizerFactoryDispatchTest.kt b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/TokenizerFactoryDispatchTest.kt new file mode 100644 index 00000000..ecfa58c8 --- /dev/null +++ b/skainet-io/skainet-io-core/src/commonTest/kotlin/sk/ainet/io/tokenizer/TokenizerFactoryDispatchTest.kt @@ -0,0 +1,97 @@ +package sk.ainet.io.tokenizer + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFailsWith +import kotlin.test.assertTrue + +class TokenizerFactoryDispatchTest { + + /** + * Build a minimal stub GGUF field map: 256 base byte tokens + 4 merges + * that collapse "Hello" to a single id + one control token. 
*/ + private fun qwenStyleFields(): Map<String, Any?> { + val tokens = mutableListOf<String>() + for (b in 0..255) tokens.add(ByteToUnicode.byteToUnicode[b].toString()) + val merges = mutableListOf<String>() + fun add(a: String, b: String) { + merges.add("$a $b") + tokens.add(a + b) + } + add("H", "e") + add("l", "l") + add("He", "ll") + add("Hell", "o") + tokens.add("<|end|>") + val types = IntArray(tokens.size) { if (it == tokens.lastIndex) 3 else 1 }.toList() + return mapOf( + "tokenizer.ggml.model" to "gpt2", + "tokenizer.ggml.tokens" to tokens, + "tokenizer.ggml.merges" to merges, + "tokenizer.ggml.token_type" to types, + "tokenizer.ggml.bos_token_id" to 42, + "tokenizer.ggml.eos_token_id" to tokens.lastIndex, + ) + } + + @Test + fun `gguf gpt2 dispatches to byte level BPE`() { + val tok = TokenizerFactory.fromGguf(qwenStyleFields()) + assertTrue(tok is QwenByteLevelBpeTokenizer) + val ids = tok.encode("Hello<|end|>") + assertEquals(2, ids.size) + assertEquals("Hello<|end|>", tok.decode(ids)) + } + + @Test + fun `gguf bos and eos propagate from fields`() { + val tok = TokenizerFactory.fromGguf(qwenStyleFields()) + assertEquals(42, tok.bosTokenId) + } + + @Test + fun `gguf llama throws UnsupportedTokenizerException`() { + assertFailsWith<UnsupportedTokenizerException> { + TokenizerFactory.fromGguf(mapOf("tokenizer.ggml.model" to "llama")) + } + } + + @Test + fun `gguf missing model field throws`() { + assertFailsWith<UnsupportedTokenizerException> { + TokenizerFactory.fromGguf(emptyMap()) + } + } + + @Test + fun `tokenizer_json BPE dispatches to byte level BPE`() { + // Synthesize a minimal tokenizer.json with 2 vocab entries + 1 merge.
+ val json = """ + { + "version": "1.0", + "added_tokens": [ + {"id": 2, "content": "<|end|>", "special": true} + ], + "model": { + "type": "BPE", + "vocab": {"a": 0, "b": 1, "<|end|>": 2, "ab": 3}, + "merges": ["a b"] + } + } + """.trimIndent() + val tok = TokenizerFactory.fromTokenizerJson(json) + assertTrue(tok is QwenByteLevelBpeTokenizer) + val ids = tok.encode("ab<|end|>") + // "ab" should merge to id 3, then special id 2 + assertEquals(listOf(3, 2), ids.toList()) + } + + @Test + fun `tokenizer_json Unigram throws`() { + val json = """{"model":{"type":"Unigram","vocab":[]}}""" + assertFailsWith<UnsupportedTokenizerException> { + TokenizerFactory.fromTokenizerJson(json) + } + } +} diff --git a/skainet-io/skainet-io-core/src/jvmTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerFixtureTest.kt b/skainet-io/skainet-io-core/src/jvmTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerFixtureTest.kt new file mode 100644 index 00000000..e40bed4a --- /dev/null +++ b/skainet-io/skainet-io-core/src/jvmTest/kotlin/sk/ainet/io/tokenizer/QwenByteLevelBpeTokenizerFixtureTest.kt @@ -0,0 +1,148 @@ +package sk.ainet.io.tokenizer + +import sk.ainet.io.JvmRandomAccessSource +import sk.ainet.io.gguf.GgufModelMetadata +import sk.ainet.io.gguf.StreamingGGUFReader +import java.io.File +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * End-to-end reference tests against the real Qwen2.5-0.5B-Instruct tokenizer. + * + * These tests are **gated on an external fixture** that is not committed + * to the repo. Run: + * + * ./gradlew :skainet-io:skainet-io-core:downloadQwenTokenizerFixtures + * + * once to download the files into build/test-fixtures/. When the fixture + * is absent (CI / offline builds), tests print a skip notice and pass — + * the format-independent unit tests in commonTest still exercise the core + * algorithm without needing network access.
+ * + * Expected token IDs come from HuggingFace `transformers`: + * from transformers import AutoTokenizer + * tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct") + * tok.encode("Hello", add_special_tokens=False) # [9707] + * tok.encode("<|im_start|>", add_special_tokens=False) # [151644] + */ +class QwenByteLevelBpeTokenizerFixtureTest { + + private val fixturesDir: File = File( + System.getProperty("skainet.test.fixturesDir") + ?: (System.getProperty("user.dir") + "/build/test-fixtures") + ) + private val ggufFile = File(fixturesDir, "Qwen2.5-0.5B-Instruct-Q8_0.gguf") + private val tokenizerJsonFile = File(fixturesDir, "tokenizer.json") + + private fun skipIfMissing(files: List<File>): Boolean { + val missing = files.filterNot { it.exists() && it.length() > 0 } + if (missing.isEmpty()) return false + println( + "[skip] QwenByteLevelBpeTokenizerFixtureTest: missing fixture(s) " + + missing.joinToString { it.name } + + " — run ':skainet-io:skainet-io-core:downloadQwenTokenizerFixtures'" + ) + return true + } + + private fun loadFromGguf(): Tokenizer { + return JvmRandomAccessSource.open(ggufFile).use { src -> + StreamingGGUFReader.open(src).use { reader -> + TokenizerFactory.fromGguf(reader.fields) + } + } + } + + private fun loadFromJson(): Tokenizer = + TokenizerFactory.fromTokenizerJson(tokenizerJsonFile.readText()) + + @Test + fun `single ASCII word encodes to single Qwen token`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + assertEquals(listOf(9707), tok.encode("Hello").toList()) + } + + @Test + fun `special chat template token encodes as one atomic token`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + assertEquals(listOf(151644), tok.encode("<|im_start|>").toList()) + } + + @Test + fun `sentence encodes to known Qwen2_5 token sequence`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + assertEquals( + listOf(785, 6722, 315, 9625, 374), + tok.encode("The 
capital of France is").toList() + ) + } + + @Test + fun `newline encodes as single Qwen byte token`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + assertEquals(listOf(198), tok.encode("\n").toList()) + } + + @Test + fun `encode then decode is identity for ASCII`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + val input = "The capital of France is" + assertEquals(input, tok.decode(tok.encode(input))) + } + + @Test + fun `encode then decode is identity for text with special tokens`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + val input = "<|im_start|>user\nHello<|im_end|>" + assertEquals(input, tok.decode(tok.encode(input))) + } + + @Test + fun `chat template prompt starts with expected IDs`() { + if (skipIfMissing(listOf(ggufFile))) return + val tok = loadFromGguf() + val prompt = "<|im_start|>system\nYou are helpful.<|im_end|>\n" + + "<|im_start|>user\nHi<|im_end|>\n" + + "<|im_start|>assistant\n" + val ids = tok.encode(prompt) + assertTrue(ids.size > 10) + assertEquals(151644, ids[0]) // <|im_start|> + assertEquals(8948, ids[1]) // system + assertEquals(198, ids[2]) // newline + } + + @Test + fun `GGUF and tokenizer_json produce identical token ids`() { + if (skipIfMissing(listOf(ggufFile, tokenizerJsonFile))) return + val ggufTok = loadFromGguf() + val jsonTok = loadFromJson() + val samples = listOf( + "Hello", + "The capital of France is", + "<|im_start|>user\nHi<|im_end|>", + "\n", + "What is 2 + 2?", + ) + for (text in samples) { + assertEquals( + ggufTok.encode(text).toList(), + jsonTok.encode(text).toList(), + "mismatch for '$text'" + ) + } + } + + @Test + fun `GGUF dispatches to QwenByteLevelBpeTokenizer`() { + if (skipIfMissing(listOf(ggufFile))) return + assertTrue(loadFromGguf() is QwenByteLevelBpeTokenizer) + } +} diff --git a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/GgufModelMetadata.kt 
b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/GgufModelMetadata.kt index dd5e4f99..c0a85959 100644 --- a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/GgufModelMetadata.kt +++ b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/GgufModelMetadata.kt @@ -56,9 +56,39 @@ public data class GgufModelMetadata( /** Number of layers */ val layerCount: Int?, - /** Vocabulary size */ + /** Vocabulary size — derived from the tokenizer tokens list when present. */ val vocabSize: Int?, + /** + * Tokenizer model identifier (`tokenizer.ggml.model`), e.g. `"gpt2"`, + * `"llama"`, `"bert"`. Used by `TokenizerFactory` to dispatch to the + * right tokenizer implementation regardless of file format. + */ + val tokenizerModel: String? = null, + + /** Full vocab as stored in `tokenizer.ggml.tokens` (index = token id). */ + val tokenizerTokens: List<String>? = null, + + /** + * Merge list from `tokenizer.ggml.merges`, each entry formatted as + * `"first second"` (space-separated). Priority order — index 0 is the + * highest-priority merge. + */ + val tokenizerMerges: List<String>? = null, + + /** + * Per-token type codes from `tokenizer.ggml.token_type`. GGUF convention: + * 1 = normal, 2 = unknown, 3 = control/special, 4 = user-defined, + * 5 = unused, 6 = byte. + */ + val tokenizerTokenTypes: List<Int>? = null, + + /** BOS token id from `tokenizer.ggml.bos_token_id`, if present. */ + val bosTokenId: Int? = null, + + /** EOS token id from `tokenizer.ggml.eos_token_id`, if present. */ + val eosTokenId: Int? 
= null, + /** All raw metadata fields for custom access */ val rawFields: Map<String, Any> ) { @@ -80,6 +110,7 @@ public data class GgufModelMetadata( * @return Structured metadata */ public fun from(fields: Map<String, Any>): GgufModelMetadata { + val tokenizerTokens = fields.getStringList("tokenizer.ggml.tokens") return GgufModelMetadata( architecture = fields.getString("general.architecture"), name = fields.getString("general.name"), @@ -123,10 +154,14 @@ "general.layer_count", "model.layer_count" ), - vocabSize = fields.getInt( - "llama.vocab_size", - "tokenizer.ggml.tokens" - )?.let { if (it > 0) it else null }, + vocabSize = tokenizerTokens?.size + ?: fields.getInt("llama.vocab_size")?.takeIf { it > 0 }, + tokenizerModel = fields.getString("tokenizer.ggml.model"), + tokenizerTokens = tokenizerTokens, + tokenizerMerges = fields.getStringList("tokenizer.ggml.merges"), + tokenizerTokenTypes = fields.getIntList("tokenizer.ggml.token_type"), + bosTokenId = fields.getInt("tokenizer.ggml.bos_token_id"), + eosTokenId = fields.getInt("tokenizer.ggml.eos_token_id"), rawFields = fields ) } @@ -152,6 +187,21 @@ return null } + private fun Map<String, Any>.getIntList(vararg keys: String): List<Int>? { + for (key in keys) { + val value = this[key] ?: continue + val ints = when (value) { + is List<*> -> value.mapNotNull { (it as? Number)?.toInt() } + is Array<*> -> value.mapNotNull { (it as? Number)?.toInt() } + is IntArray -> value.toList() + is LongArray -> value.map { it.toInt() } + else -> null + } + if (ints != null && ints.isNotEmpty()) return ints + } + return null + } + @Suppress("UNCHECKED_CAST") private fun Map<String, Any>.getStringList(vararg keys: String): List<String>? 
{ for (key in keys) { diff --git a/skainet-io/skainet-io-gguf/src/commonTest/kotlin/sk/ainet/io/gguf/GgufModelMetadataTokenizerTest.kt b/skainet-io/skainet-io-gguf/src/commonTest/kotlin/sk/ainet/io/gguf/GgufModelMetadataTokenizerTest.kt new file mode 100644 index 00000000..22a2c1b3 --- /dev/null +++ b/skainet-io/skainet-io-gguf/src/commonTest/kotlin/sk/ainet/io/gguf/GgufModelMetadataTokenizerTest.kt @@ -0,0 +1,51 @@ +package sk.ainet.io.gguf + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNull + +class GgufModelMetadataTokenizerTest { + + @Test + fun `extracts tokenizer fields from raw map`() { + val fields = mapOf( + "general.architecture" to "qwen2", + "tokenizer.ggml.model" to "gpt2", + "tokenizer.ggml.tokens" to listOf("!", "\"", "#", "Hello", "<|im_start|>"), + "tokenizer.ggml.merges" to listOf("H e", "He l", "Hel l", "Hell o"), + "tokenizer.ggml.token_type" to listOf(1, 1, 1, 1, 3), + "tokenizer.ggml.bos_token_id" to 151643, + "tokenizer.ggml.eos_token_id" to 151645, + ) + + val md = GgufModelMetadata.from(fields) + + assertEquals("qwen2", md.architecture) + assertEquals("gpt2", md.tokenizerModel) + assertEquals(5, md.vocabSize) + assertEquals(5, md.tokenizerTokens?.size) + assertEquals("Hello", md.tokenizerTokens?.get(3)) + assertEquals(4, md.tokenizerMerges?.size) + assertEquals("H e", md.tokenizerMerges?.get(0)) + assertEquals(listOf(1, 1, 1, 1, 3), md.tokenizerTokenTypes) + assertEquals(151643, md.bosTokenId) + assertEquals(151645, md.eosTokenId) + } + + @Test + fun `missing tokenizer fields stay null`() { + val md = GgufModelMetadata.from(mapOf("general.architecture" to "llama")) + assertNull(md.tokenizerModel) + assertNull(md.tokenizerTokens) + assertNull(md.tokenizerMerges) + assertNull(md.tokenizerTokenTypes) + assertNull(md.bosTokenId) + assertNull(md.eosTokenId) + } + + @Test + fun `vocab size falls back to llama vocab_size when no tokens list`() { + val md = GgufModelMetadata.from(mapOf("llama.vocab_size" to 
32000)) + assertEquals(32000, md.vocabSize) + } +}