CHANGELOG.md (+11 −0)
@@ -1,5 +1,16 @@
# Changelog

## [Unreleased]

### Added
- **Qwen / GPT-2 Byte-Level BPE Tokenizer**: `QwenByteLevelBpeTokenizer` implements the full GPT-2-style pipeline — byte-to-unicode mapping, GPT-2 pretokenization regex, merge-rank BPE, and atomic special-token splitting. Builds from either GGUF metadata (`fromGgufFields`) or a HuggingFace `tokenizer.json` (`fromTokenizerJson`). Verified against Qwen2.5-0.5B reference token IDs from HuggingFace `transformers`.
- **`TokenizerFactory` with Per-Architecture Dispatch**: Tokenizer selection is now **per-architecture, not per file format**. `TokenizerFactory.fromGguf(fields)` and `.fromTokenizerJson(json)` inspect `tokenizer.ggml.model` / `model.type` and dispatch to the right implementation — so a Qwen model uses byte-level BPE whether its weights come from `.gguf` or `.safetensors`.
- **`Tokenizer` Interface**: Common surface implemented by `TekkenTokenizer` and `QwenByteLevelBpeTokenizer` (`encode`, `decode`, `vocabSize`, `bosTokenId`, `eosTokenId`).
- **GGUF Tokenizer Metadata**: `GgufModelMetadata` now exposes `tokenizerModel`, `tokenizerTokens`, `tokenizerMerges`, `tokenizerTokenTypes`, `bosTokenId`, and `eosTokenId` so callers can build a tokenizer without re-parsing the raw field map.
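
A minimal usage sketch of the per-architecture dispatch described above (`TokenizerFactory`, `Tokenizer`, `encode`, and `decode` are the names from this changelog; `fields` and the return shapes are illustrative assumptions, not the verified API):

```kotlin
// Hedged sketch: `fields` stands in for whatever GGUF metadata the caller
// already parsed. The factory inspects `tokenizer.ggml.model` and returns
// the matching implementation (e.g. QwenByteLevelBpeTokenizer for Qwen).
val tokenizer: Tokenizer = TokenizerFactory.fromGguf(fields) // or .fromTokenizerJson(json)
val ids = tokenizer.encode("Hello, world!")
val roundTripped = tokenizer.decode(ids)
```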

### Fixed
- **Byte-Level BPE Broken for Qwen/GPT-2 Models**: Previously there was no GPT-2-style byte-level BPE tokenizer in the repo, and `GgufModelMetadata` ignored `tokenizer.ggml.merges` entirely — so any Qwen / GPT-2 / Mistral-Nemo model encoded text into garbage tokens (byte-level chars instead of merged vocab IDs), blocking chat mode and tool calling. The new `QwenByteLevelBpeTokenizer` + `TokenizerFactory` dispatch fix the issue for both GGUF and SafeTensors sources. SentencePiece / LLaMA support is tracked separately in #464. (#463)

## [0.18.0] - 2026-04-08

### Added
Expand Down
skainet-io/skainet-io-core/build.gradle.kts (+46 −0)
@@ -2,6 +2,7 @@

import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
import java.net.URI

plugins {
alias(libs.plugins.kotlinMultiplatform)
@@ -74,6 +75,7 @@ kotlin {
dependencies {
implementation(libs.kotlinx.coroutines)
implementation(project(":skainet-backends:skainet-backend-cpu"))
implementation(project(":skainet-io:skainet-io-gguf"))
}
}

@@ -97,3 +99,47 @@ kotlin {
}
}
}

// ============================================================================
// Test fixtures for QwenByteLevelBpeTokenizer end-to-end tests (#463).
//
// Downloads a small public Qwen2.5 model + tokenizer.json into
// build/test-fixtures/. Tests check for file presence and skip cleanly
// when absent, so offline/CI builds without network still stay green.
//
// Run `./gradlew :skainet-io:skainet-io-core:downloadQwenTokenizerFixtures`
// once before running the fixture-gated tests.
// ============================================================================
val fixturesDir = layout.buildDirectory.dir("test-fixtures")

val downloadQwenTokenizerFixtures by tasks.registering {
group = "verification"
description = "Download Qwen2.5-0.5B GGUF + tokenizer.json for #463 tests"
val outDir = fixturesDir
outputs.dir(outDir)
doLast {
val dir = outDir.get().asFile.apply { mkdirs() }
val files = listOf(
"Qwen2.5-0.5B-Instruct-Q8_0.gguf" to
"https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf",
"tokenizer.json" to
"https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/resolve/main/tokenizer.json",
)
for ((name, url) in files) {
val target = dir.resolve(name)
if (target.exists() && target.length() > 0) {
logger.lifecycle("fixture already present: ${target.name}")
continue
}
logger.lifecycle("downloading $name from $url")
URI(url).toURL().openStream().use { input ->
target.outputStream().use { out -> input.copyTo(out) }
}
logger.lifecycle(" -> ${target.length()} bytes")
}
}
}

tasks.withType<Test>().configureEach {
systemProperty("skainet.test.fixturesDir", fixturesDir.get().asFile.absolutePath)
}
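
The skip-when-absent behavior the comment block describes could look like the following hypothetical helper. The `skainet.test.fixturesDir` property is the one set by the build script above; `fixtureOrNull` and the fallback path are illustrative names, not code from this PR:

```kotlin
import java.io.File

// Hypothetical helper illustrating the fixture-gating pattern: resolve a
// fixture by name, returning null when it has not been downloaded so the
// calling test can skip cleanly instead of failing.
fun fixtureOrNull(name: String): File? {
    val dir = System.getProperty("skainet.test.fixturesDir")
        ?: "build/test-fixtures" // assumed fallback for IDE runs without the Gradle property
    val file = File(dir, name)
    return if (file.isFile && file.length() > 0) file else null
}

fun main() {
    val tokenizerJson = fixtureOrNull("tokenizer.json")
    if (tokenizerJson == null) {
        println("fixture missing; test would be skipped")
    } else {
        println("fixture present: ${tokenizerJson.length()} bytes")
    }
}
```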
@@ -0,0 +1,71 @@
package sk.ainet.io.tokenizer

/**
* GPT-2 byte-to-unicode mapping.
*
* Byte-level BPE tokenizers (GPT-2, Qwen, Mistral-Nemo, …) operate on a
* reversible map from every possible byte (0..255) to a unique printable
 * Unicode code point. This keeps control characters and whitespace from
 * appearing as raw "bytes" inside BPE symbols, where they would otherwise
 * interfere with regex pretokenization and JSON serialization.
*
 * The table is the canonical one from GPT-2's `bytes_to_unicode`
 * (see https://github.com/openai/gpt-2/blob/master/src/encoder.py and
 * HuggingFace `tokenizers`): printable ASCII (`!`..`~`), Latin-1
* supplement blocks (`¡`..`¬`, `®`..`ÿ`) map to themselves; every other
* byte is relocated into the 256..323 range.
*
* Every mapped code point is in the BMP (< U+10000), so `Char` iteration
* is sufficient — no surrogate-pair handling required.
*/
internal object ByteToUnicode {

/** `byteToUnicode[b]` is the `Char` representing byte `b`. */
val byteToUnicode: CharArray = buildByteToUnicode()

/** Reverse lookup: `Char` → original byte (0..255). */
val unicodeToByte: Map<Char, Byte> = buildUnicodeToByte(byteToUnicode)

private fun buildByteToUnicode(): CharArray {
val printable = mutableListOf<Int>()
for (b in '!'.code..'~'.code) printable.add(b)
for (b in '¡'.code..'¬'.code) printable.add(b)
for (b in '®'.code..'ÿ'.code) printable.add(b)

val printableSet = printable.toHashSet()
val result = CharArray(256)
for (b in printable) result[b] = b.toChar()

var next = 256
for (b in 0..255) {
if (b !in printableSet) {
result[b] = next.toChar()
next++
}
}
return result
}

private fun buildUnicodeToByte(forward: CharArray): Map<Char, Byte> {
val map = HashMap<Char, Byte>(256)
for (b in 0..255) map[forward[b]] = b.toByte()
return map
}

/** Encode a UTF-8 byte sequence to its byte-level BPE string form. */
fun encode(bytes: ByteArray): String {
val sb = StringBuilder(bytes.size)
for (b in bytes) sb.append(byteToUnicode[b.toInt() and 0xFF])
return sb.toString()
}

/** Decode a byte-level BPE string back to its UTF-8 byte sequence. */
fun decode(s: String): ByteArray {
val out = ByteArray(s.length)
for (i in s.indices) {
out[i] = unicodeToByte[s[i]]
?: error("byte-level BPE string contained unmapped char: U+${s[i].code.toString(16)}")
}
return out
}
}
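
As a sanity check, the same table can be rebuilt standalone and verified to round-trip all 256 bytes. This is an independent sketch that mirrors the construction above, not the project's code; one nice property it confirms is that byte 0x20 (space) relocates to U+0120, the familiar `Ġ` seen in GPT-2 vocab dumps:

```kotlin
// Standalone rebuild of the GPT-2 byte<->unicode table for verification.
val printable: List<Int> =
    ('!'.code..'~'.code) + ('¡'.code..'¬'.code) + ('®'.code..'ÿ'.code)

val forward: CharArray = CharArray(256).also { arr ->
    for (b in printable) arr[b] = b.toChar()       // printable bytes map to themselves
    var next = 256
    for (b in 0..255) if (b !in printable) arr[b] = (next++).toChar() // rest relocate to 256..323
}

val reverse: Map<Char, Int> = forward.withIndex().associate { (b, c) -> c to b }

fun encodeBytes(bytes: ByteArray): String =
    buildString(bytes.size) { for (b in bytes) append(forward[b.toInt() and 0xFF]) }

fun decodeBytes(s: String): ByteArray =
    ByteArray(s.length) { reverse.getValue(s[it]).toByte() }

fun main() {
    val all = ByteArray(256) { it.toByte() }
    check(decodeBytes(encodeBytes(all)).contentEquals(all)) // every byte round-trips
    check(forward[' '.code] == 'Ġ')                         // space -> U+0120
    println("round-trip OK")
}
```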