Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## [Unreleased]

### Fixed
- **`/v1/embeddings` returned Ollama-native shape instead of OpenAI shape** — Ollama's `/api/embed` sends `application/json` responses with `transfer-encoding: chunked`, which `dispatch_request` incorrectly classified as streaming. This caused `proxy_handler` to receive a `StreamingResponse` rather than a `JSONResponse`, so the `isinstance` guard before `wrap_response` was never entered and the Ollama-native body passed through unwrapped. Fixed by removing `transfer-encoding: chunked` from the streaming heuristic — chunked TE is a transport-layer concern and is not a reliable indicator of application-level streaming. True streaming responses are identified solely by their content-type (`text/event-stream`, `application/x-ndjson`).

## [0.3.1] - 2026-06-16

### Fixed
Expand Down
10 changes: 6 additions & 4 deletions src/ollama_queue_proxy/proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,14 +214,16 @@ def _next_host() -> OllamaHost | None:
pass

# Check if this is a streaming response.
# Ollama uses application/x-ndjson for streaming generate/chat,
# text/event-stream for some endpoints, and application/json (chunked)
# for others. Treat any chunked transfer or ndjson content as streaming.
# Ollama uses application/x-ndjson for streaming generate/chat and
# text/event-stream for some endpoints. Chunked transfer-encoding is
# a transport concern and is NOT a reliable streaming indicator —
# /api/embed returns application/json with chunked TE even though the
# response is a single JSON object. Only the content-type identifies
# true streaming responses.
content_type = resp.headers.get("content-type", "")
is_streaming = (
"text/event-stream" in content_type
or "application/x-ndjson" in content_type
or resp.headers.get("transfer-encoding", "").lower() == "chunked"
)

response_headers = {
Expand Down
65 changes: 65 additions & 0 deletions tests/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,68 @@ async def test_non_streaming_response_content_length_correct():
assert response.headers["content-length"] != upstream_content_length, (
"upstream content-length (with trailing newline) must not bleed into response"
)


# ---------------------------------------------------------------------------
# OQP-4: /api/embed returns application/json with chunked TE — must be JSONResponse
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_chunked_json_response_not_treated_as_streaming():
    """Regression test for OQP-4: chunked JSON is not a streaming response.

    When the upstream replies with ``application/json`` and
    ``transfer-encoding: chunked`` (as Ollama's /api/embed does),
    ``dispatch_request`` must hand back a ``JSONResponse`` rather than a
    ``StreamingResponse``. proxy_handler only applies the OpenAI compat
    ``wrap_response`` behind an ``isinstance(response, JSONResponse)`` check,
    so a streaming classification let the Ollama-native body through
    unwrapped.
    """
    from fastapi import Request
    from fastapi.responses import JSONResponse, StreamingResponse
    from ollama_queue_proxy.proxy import dispatch_request
    from ollama_queue_proxy.hosts import HostManager, OllamaHost
    from tests.conftest import make_config

    expected_body = {"embeddings": [[0.1, 0.2, 0.3]], "model": "bge-m3"}

    # Fake upstream reply: a single JSON document delivered with chunked TE.
    upstream_headers = httpx.Headers(
        {
            "content-type": "application/json; charset=utf-8",
            "transfer-encoding": "chunked",
        }
    )
    upstream_resp = MagicMock(spec=httpx.Response)
    upstream_resp.status_code = 200
    upstream_resp.headers = upstream_headers
    upstream_resp.json.return_value = expected_body

    # HTTP client whose request() resolves to the fake upstream reply.
    http_client = AsyncMock(spec=httpx.AsyncClient)
    http_client.request = AsyncMock(return_value=upstream_resp)

    config = make_config()

    # Build a host manager holding one healthy host; __new__ skips __init__.
    only_host = OllamaHost(url="http://ollama-test:11434", name="test")
    only_host.healthy = True
    host_manager = HostManager.__new__(HostManager)
    host_manager.hosts = [only_host]

    # Minimal ASGI scope describing a POST to /api/embed.
    asgi_scope = {
        "type": "http",
        "method": "POST",
        "path": "/api/embed",
        "query_string": b"",
        "headers": [(b"content-type", b"application/json")],
    }
    incoming = Request(asgi_scope)
    incoming.state.request_id = "test-oqp4"

    request_body = json.dumps({"model": "bge-m3", "input": "test"}).encode()

    response = await dispatch_request(
        request=incoming,
        body=request_body,
        client_id=None,
        config=config,
        host_manager=host_manager,
        client=http_client,
    )

    assert isinstance(response, JSONResponse), (
        "application/json response with transfer-encoding: chunked must return "
        "JSONResponse so that proxy_handler can apply the OpenAI compat wrap"
    )
    assert not isinstance(response, StreamingResponse)
    assert json.loads(response.body) == expected_body