From 9d495d948d277f66bafad7ca25b255eaf92ae4b3 Mon Sep 17 00:00:00 2001 From: Roy Peter D'Souza Date: Thu, 30 Apr 2026 07:38:18 -0700 Subject: [PATCH 1/3] Fix #3: Resolve multimodal BOA/EOA tokens from config.json instead of hardcoding --- Sources/SwiftLM/Server.swift | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift index d1298ac..1b75838 100644 --- a/Sources/SwiftLM/Server.swift +++ b/Sources/SwiftLM/Server.swift @@ -3013,15 +3013,15 @@ public final class ALMModelFactory: ModelFactory, @unchecked Sendable { ) async throws -> ModelContext { let context = try await LLMModelFactory.shared._load(configuration: configuration, tokenizerLoader: tokenizerLoader) - let numAudioEmbeddings = OmniModelFactory.extractNumAudioEmbeddings(configuration: configuration) + let tokens = OmniModelFactory.extractMultimodalTokens(configuration: configuration) let messageGenerator = DefaultMessageGenerator() let processor = ALMUserInputProcessor( tokenizer: context.tokenizer, configuration: context.configuration, messageGenerator: messageGenerator, - boaToken: 255010, - eoaToken: 255011, - numAudioEmbeddings: numAudioEmbeddings + boaToken: tokens.boa, + eoaToken: tokens.eoa, + numAudioEmbeddings: tokens.numAudio ) return .init( @@ -3081,10 +3081,12 @@ public final class OmniModelFactory: ModelFactory, @unchecked Sendable { tokenizerLoader: any TokenizerLoader ) async throws -> ModelContext { let vlmContext = try await VLMModelFactory.shared._load(configuration: configuration, tokenizerLoader: tokenizerLoader) - let numAudioEmbeddings = OmniModelFactory.extractNumAudioEmbeddings(configuration: configuration) + let tokens = OmniModelFactory.extractMultimodalTokens(configuration: configuration) let omniProcessor = OmniUserInputProcessor( vlmProcessor: vlmContext.processor, - numAudioEmbeddings: numAudioEmbeddings + boaToken: tokens.boa, + eoaToken: tokens.eoa, + numAudioEmbeddings: tokens.numAudio ) return .init( @@ -3095,19 +3097,30 @@ public final class OmniModelFactory: ModelFactory, @unchecked Sendable { ) } - public static func extractNumAudioEmbeddings(configuration: ResolvedModelConfiguration) -> Int { + public static func extractMultimodalTokens(configuration: ResolvedModelConfiguration) -> (numAudio: Int, boa: Int, eoa: Int) { let configurationURL = configuration.modelDirectory.appending(component: "config.json") + var numAudio = 128 + var boa = 255010 + var eoa = 255011 + if let data = try? Data(contentsOf: configurationURL), let dict = try? JSONSerialization.jsonObject(with: data) as? [String: Any] { + // Extract num_audio_embeddings if let subsampling = dict["subsampling_conv_channels"] as? [Int] { - return subsampling.first ?? 128 - } - if let audioConfig = dict["audio_config"] as? [String: Any], + numAudio = subsampling.first ?? 128 + } else if let audioConfig = dict["audio_config"] as? [String: Any], let embeddings = audioConfig["num_audio_embeddings"] as? Int { - return embeddings + numAudio = embeddings } + + // Extract BOA/EOA tokens + if let b = dict["boa_token_id"] as? Int { boa = b } + else if let b = (dict["audio_config"] as? [String: Any])?["boa_token_id"] as? Int { boa = b } + + if let e = dict["eoa_token_id"] as? Int { eoa = e } + else if let e = (dict["audio_config"] as? [String: Any])?["eoa_token_id"] as? Int { eoa = e } } - return 128 + return (numAudio, boa, eoa) } } From 621a9310112841073c062dfd9442f09e5b677224 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 May 2026 17:32:06 -0700 Subject: [PATCH 2/3] test(swiftlm): Add tests for multimodal token extraction --- .../MultimodalTokenExtractionTests.swift | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tests/SwiftLMTests/MultimodalTokenExtractionTests.swift diff --git a/tests/SwiftLMTests/MultimodalTokenExtractionTests.swift b/tests/SwiftLMTests/MultimodalTokenExtractionTests.swift new file mode 100644 index 0000000..e75f679 --- /dev/null +++ b/tests/SwiftLMTests/MultimodalTokenExtractionTests.swift @@ -0,0 +1,68 @@ +import XCTest +import Foundation +@testable import SwiftLM +import MLXLMCommon + +final class MultimodalTokenExtractionTests: XCTestCase { + + func testExtractMultimodalTokens_Defaults() throws { + let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString) + try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) + defer { try? FileManager.default.removeItem(at: tempDir) } + + let config = ModelConfiguration(directory: tempDir).resolved(modelDirectory: tempDir, tokenizerDirectory: tempDir) + + let tokens = OmniModelFactory.extractMultimodalTokens(configuration: config) + XCTAssertEqual(tokens.numAudio, 128) + XCTAssertEqual(tokens.boa, 255010) + XCTAssertEqual(tokens.eoa, 255011) + } + + func testExtractMultimodalTokens_FromConfig() throws { + let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString) + try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) + defer { try? FileManager.default.removeItem(at: tempDir) } + + let jsonDict: [String: Any] = [ + "subsampling_conv_channels": [256], + "boa_token_id": 999990, + "eoa_token_id": 999991 + ] + + let jsonData = try JSONSerialization.data(withJSONObject: jsonDict) + let configURL = tempDir.appendingPathComponent("config.json") + try jsonData.write(to: configURL) + + let config = ModelConfiguration(directory: tempDir).resolved(modelDirectory: tempDir, tokenizerDirectory: tempDir) + let tokens = OmniModelFactory.extractMultimodalTokens(configuration: config) + + XCTAssertEqual(tokens.numAudio, 256) + XCTAssertEqual(tokens.boa, 999990) + XCTAssertEqual(tokens.eoa, 999991) + } + + func testExtractMultimodalTokens_FromAudioConfigFallback() throws { + let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString) + try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true) + defer { try? FileManager.default.removeItem(at: tempDir) } + + let jsonDict: [String: Any] = [ + "audio_config": [ + "num_audio_embeddings": 512, + "boa_token_id": 888880, + "eoa_token_id": 888881 + ] + ] + + let jsonData = try JSONSerialization.data(withJSONObject: jsonDict) + let configURL = tempDir.appendingPathComponent("config.json") + try jsonData.write(to: configURL) + + let config = ModelConfiguration(directory: tempDir).resolved(modelDirectory: tempDir, tokenizerDirectory: tempDir) + let tokens = OmniModelFactory.extractMultimodalTokens(configuration: config) + + XCTAssertEqual(tokens.numAudio, 512) + XCTAssertEqual(tokens.boa, 888880) + XCTAssertEqual(tokens.eoa, 888881) + } +} From 5cfc277fcf383d4bed613b9856a189c8a4696a06 Mon Sep 17 00:00:00 2001 From: Simba Date: Thu, 7 May 2026 20:23:35 -0700 Subject: [PATCH 3/3] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- Sources/SwiftLM/Server.swift | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift index 1b75838..ee70df7 100644 --- a/Sources/SwiftLM/Server.swift +++ b/Sources/SwiftLM/Server.swift @@ -3097,6 +3097,11 @@ public final class OmniModelFactory: ModelFactory, @unchecked Sendable { ) } + @available(*, deprecated, message: "Use extractMultimodalTokens(configuration:).numAudio instead") + public static func extractNumAudioEmbeddings(configuration: ResolvedModelConfiguration) -> Int { + extractMultimodalTokens(configuration: configuration).numAudio + } + public static func extractMultimodalTokens(configuration: ResolvedModelConfiguration) -> (numAudio: Int, boa: Int, eoa: Int) { let configurationURL = configuration.modelDirectory.appending(component: "config.json") var numAudio = 128