From 7a1dd1daf5cd574a1cfe3a22fba9f6abda84a5a0 Mon Sep 17 00:00:00 2001 From: developer-agent Date: Tue, 23 Jun 2026 09:21:47 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20don't=20treat=20chunked=20TE=20as=20stre?= =?UTF-8?q?aming=20=E2=80=94=20fixes=20/v1/embeddings=20OpenAI=20wrap=20(O?= =?UTF-8?q?QP-4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ollama's /api/embed sends application/json with transfer-encoding: chunked. dispatch_request classified any chunked response as streaming, returning a StreamingResponse. proxy_handler's isinstance(response, JSONResponse) guard then failed, so wrap_response was never called and the Ollama-native body passed through to callers expecting the OpenAI shape. Fix: remove transfer-encoding: chunked from the is_streaming heuristic. Chunked TE is a transport-layer mechanism; true streaming is identified by content-type (text/event-stream, application/x-ndjson). Adds regression test: test_chunked_json_response_not_treated_as_streaming. Co-Authored-By: Claude Sonnet 4.6 agent-id: developer --- CHANGELOG.md | 3 ++ src/ollama_queue_proxy/proxy.py | 10 +++-- tests/test_proxy.py | 65 +++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 219264e..9ab1494 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## [Unreleased] +### Fixed +- **`/v1/embeddings` returned Ollama-native shape instead of OpenAI shape** — Ollama's `/api/embed` sends `application/json` responses with `transfer-encoding: chunked`, which `dispatch_request` incorrectly classified as streaming. This caused `proxy_handler` to receive a `StreamingResponse` rather than a `JSONResponse`, so the `isinstance` guard before `wrap_response` was never entered and the Ollama-native body passed through unwrapped. Fixed by removing `transfer-encoding: chunked` from the streaming heuristic — chunked TE is a transport-layer concern and is not a reliable indicator of application-level streaming. True streaming responses are identified solely by their content-type (`text/event-stream`, `application/x-ndjson`). + ## [0.3.1] - 2026-06-16 ### Fixed diff --git a/src/ollama_queue_proxy/proxy.py b/src/ollama_queue_proxy/proxy.py index 2f53a5b..f9a4b89 100644 --- a/src/ollama_queue_proxy/proxy.py +++ b/src/ollama_queue_proxy/proxy.py @@ -214,14 +214,16 @@ def _next_host() -> OllamaHost | None: pass # Check if this is a streaming response. - # Ollama uses application/x-ndjson for streaming generate/chat, - # text/event-stream for some endpoints, and application/json (chunked) - # for others. Treat any chunked transfer or ndjson content as streaming. + # Ollama uses application/x-ndjson for streaming generate/chat and + # text/event-stream for some endpoints. Chunked transfer-encoding is + # a transport concern and is NOT a reliable streaming indicator — + # /api/embed returns application/json with chunked TE even though the + # response is a single JSON object. Only the content-type identifies + # true streaming responses. content_type = resp.headers.get("content-type", "") is_streaming = ( "text/event-stream" in content_type or "application/x-ndjson" in content_type - or resp.headers.get("transfer-encoding", "").lower() == "chunked" ) response_headers = { diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 0b1b0d5..cf36288 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -113,3 +113,68 @@ async def test_non_streaming_response_content_length_correct(): assert response.headers["content-length"] != upstream_content_length, ( "upstream content-length (with trailing newline) must not bleed into response" ) + + +# --------------------------------------------------------------------------- +# OQP-4: /api/embed returns application/json with chunked TE — must be JSONResponse +# --------------------------------------------------------------------------- + +@pytest.mark.asyncio +async def test_chunked_json_response_not_treated_as_streaming(): + """ + dispatch_request must return JSONResponse (not StreamingResponse) when the + upstream sends application/json with transfer-encoding: chunked. + Ollama's /api/embed does this. Treating it as streaming caused proxy_handler's + isinstance(response, JSONResponse) check to fail, preventing the OpenAI compat + wrap_response call — the root cause of OQP-4. + """ + from fastapi import Request + from fastapi.responses import JSONResponse, StreamingResponse + from ollama_queue_proxy.proxy import dispatch_request + from ollama_queue_proxy.hosts import HostManager, OllamaHost + from tests.conftest import make_config + + payload = {"embeddings": [[0.1, 0.2, 0.3]], "model": "bge-m3"} + + mock_resp = MagicMock(spec=httpx.Response) + mock_resp.status_code = 200 + mock_resp.headers = httpx.Headers({ + "content-type": "application/json; charset=utf-8", + "transfer-encoding": "chunked", + }) + mock_resp.json.return_value = payload + + mock_client = AsyncMock(spec=httpx.AsyncClient) + mock_client.request = AsyncMock(return_value=mock_resp) + + cfg = make_config() + host = OllamaHost(url="http://ollama-test:11434", name="test") + host.healthy = True + hm = HostManager.__new__(HostManager) + hm.hosts = [host] + + scope = { + "type": "http", + "method": "POST", + "path": "/api/embed", + "query_string": b"", + "headers": [(b"content-type", b"application/json")], + } + request = Request(scope) + request.state.request_id = "test-oqp4" + + response = await dispatch_request( + request=request, + body=json.dumps({"model": "bge-m3", "input": "test"}).encode(), + client_id=None, + config=cfg, + host_manager=hm, + client=mock_client, + ) + + assert isinstance(response, JSONResponse), ( + "application/json response with transfer-encoding: chunked must return " + "JSONResponse so that proxy_handler can apply the OpenAI compat wrap" + ) + assert not isinstance(response, StreamingResponse) + assert json.loads(response.body) == payload