src/api/providers/fetchers/__tests__/ollama.test.ts (117 additions, 0 deletions)
@@ -114,6 +114,123 @@ describe("Ollama Fetcher", () => {
expect(parsedModel!.supportsImages).toBe(true)
expect(parsedModel!.contextWindow).toBeGreaterThan(0)
})

it("should detect vision via details.families when capabilities omits vision", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["gemma4", "clip"],
},
capabilities: ["completion", "tools"], // no "vision"
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})

it("should detect vision via model_info keys when capabilities and families lack vision indicators", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["gemma4"],
},
model_info: {
...ollamaModelsData["qwen3-2to16:latest"].model_info,
"gemma4_vision_encoder.block_count": 27,
"gemma4_vision_encoder.embedding_length": 1152,
},
capabilities: ["completion", "tools"], // no "vision"
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})

it("should detect vision via siglip family in details.families", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["gemma4", "siglip"],
},
capabilities: ["completion", "tools"],
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})

it("should detect vision via mmproj family in details.families", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["llama", "mmproj"],
},
capabilities: ["completion", "tools"],
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})

it("should detect vision via mllama family in details.families", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["llama", "mllama"],
},
capabilities: ["completion", "tools"],
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})

it("should not detect vision when no indicators are present", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["qwen3"],
},
capabilities: ["completion", "tools"],
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(false)
})

it("should handle case-insensitive family matching for vision detection", () => {
const modelData = {
...ollamaModelsData["qwen3-2to16:latest"],
details: {
...ollamaModelsData["qwen3-2to16:latest"].details,
families: ["gemma4", "CLIP"],
},
capabilities: ["completion", "tools"],
}

const parsedModel = parseOllamaModel(modelData as any)

expect(parsedModel).not.toBeNull()
expect(parsedModel!.supportsImages).toBe(true)
})
})

describe("getOllamaModels", () => {
src/api/providers/fetchers/ollama.ts (41 additions, 1 deletion)
@@ -37,6 +37,46 @@ type OllamaModelsResponse = z.infer<typeof OllamaModelsResponseSchema>

type OllamaModelInfoResponse = z.infer<typeof OllamaModelInfoResponseSchema>

/**
* Known vision-related family names that appear in `details.families` for
* multimodal models in Ollama. When a model's `capabilities` array omits
* "vision" (as happens with some third-party quants like unsloth), we fall
* back to checking these families.
*/
const VISION_FAMILIES = new Set(["clip", "siglip", "mmproj", "mllama"])

/**
 * Regex pattern matched against `model_info` keys to detect a vision
 * encoder even when `capabilities` and `details.families` are both silent.
 */
const VISION_MODEL_INFO_PATTERN = /vision|clip|siglip|mmproj|image_encoder/i
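// e.g. "gemma4_vision_encoder.block_count" (the key used in the test fixtures above) matches.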

/**
* Determines whether the model supports images by checking:
* 1. The authoritative `capabilities` array (preferred).
* 2. `details.families` for known vision encoder families.
* 3. `model_info` keys for vision-related architecture indicators.
*/
const detectVisionSupport = (rawModel: OllamaModelInfoResponse): boolean => {
// 1. Authoritative check
if (rawModel.capabilities?.includes("vision")) {
return true
}

// 2. Families check
const families = rawModel.details.families
if (families?.some((f) => VISION_FAMILIES.has(f.toLowerCase()))) {
return true
}

// 3. model_info key check
if (Object.keys(rawModel.model_info).some((k) => VISION_MODEL_INFO_PATTERN.test(k))) {
return true
}

return false
}
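
// Illustrative usage, assuming a hypothetical `fixture` of type
// OllamaModelInfoResponse whose capabilities are ["completion", "tools"]:
//
//   detectVisionSupport({
//       ...fixture,
//       details: { ...fixture.details, families: ["gemma4", "clip"] },
//   }) // => true, via the families fallback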

export const parseOllamaModel = (rawModel: OllamaModelInfoResponse): ModelInfo | null => {
const contextKey = Object.keys(rawModel.model_info).find((k) => k.includes("context_length"))
const contextWindow =
@@ -52,7 +92,7 @@ export const parseOllamaModel = (rawModel: OllamaModelInfoResponse): ModelInfo |
description: `Family: ${rawModel.details.family}, Context: ${contextWindow}, Size: ${rawModel.details.parameter_size}`,
contextWindow: contextWindow || ollamaDefaultModelInfo.contextWindow,
supportsPromptCache: true,
-	supportsImages: rawModel.capabilities?.includes("vision"),
+	supportsImages: detectVisionSupport(rawModel),
maxTokens: contextWindow || ollamaDefaultModelInfo.contextWindow,
})
