From 7a1dd1daf5cd574a1cfe3a22fba9f6abda84a5a0 Mon Sep 17 00:00:00 2001
From: developer-agent <developer@forge>
Date: Tue, 23 Jun 2026 09:21:47 -0400
Subject: [PATCH] =?UTF-8?q?fix:=20don't=20treat=20chunked=20TE=20as=20stre?=
 =?UTF-8?q?aming=20=E2=80=94=20fixes=20/v1/embeddings=20OpenAI=20wrap=20(O?=
 =?UTF-8?q?QP-4)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ollama's /api/embed sends application/json with transfer-encoding: chunked.
dispatch_request classified any chunked response as streaming, returning a
StreamingResponse. proxy_handler's isinstance(response, JSONResponse) guard
then failed, so wrap_response was never called and the Ollama-native body
passed through to callers expecting the OpenAI shape.

Fix: remove transfer-encoding: chunked from the is_streaming heuristic.
Chunked TE is a transport-layer mechanism; true streaming is identified by
content-type (text/event-stream, application/x-ndjson).

Adds regression test: test_chunked_json_response_not_treated_as_streaming.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
agent-id: developer
---
 CHANGELOG.md                    |  3 ++
 src/ollama_queue_proxy/proxy.py | 10 +++--
 tests/test_proxy.py             | 65 +++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 219264e..9ab1494 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## [Unreleased]
 
+### Fixed
+- **`/v1/embeddings` returned Ollama-native shape instead of OpenAI shape** — Ollama's `/api/embed` sends `application/json` responses with `transfer-encoding: chunked`, which `dispatch_request` incorrectly classified as streaming. As a result `proxy_handler` received a `StreamingResponse` rather than a `JSONResponse`, its `isinstance(response, JSONResponse)` check failed, `wrap_response` was never called, and the Ollama-native body passed through unwrapped. Fixed by removing `transfer-encoding: chunked` from the streaming heuristic — chunked TE is a transport-layer concern and is not a reliable indicator of application-level streaming. Streaming responses are now identified solely by their content-type (`text/event-stream`, `application/x-ndjson`).
+
 ## [0.3.1] - 2026-06-16
 
 ### Fixed
diff --git a/src/ollama_queue_proxy/proxy.py b/src/ollama_queue_proxy/proxy.py
index 2f53a5b..f9a4b89 100644
--- a/src/ollama_queue_proxy/proxy.py
+++ b/src/ollama_queue_proxy/proxy.py
@@ -214,14 +214,16 @@ def _next_host() -> OllamaHost | None:
                     pass
 
             # Check if this is a streaming response.
-            # Ollama uses application/x-ndjson for streaming generate/chat,
-            # text/event-stream for some endpoints, and application/json (chunked)
-            # for others. Treat any chunked transfer or ndjson content as streaming.
+            # Ollama uses application/x-ndjson for streaming generate/chat and
+            # text/event-stream for some endpoints. Chunked transfer-encoding is
+            # a transport concern and is NOT a reliable streaming indicator —
+            # /api/embed returns application/json with chunked TE even though the
+            # response is a single JSON object. Only the content-type identifies
+            # true streaming responses.
             content_type = resp.headers.get("content-type", "")
             is_streaming = (
                 "text/event-stream" in content_type
                 or "application/x-ndjson" in content_type
-                or resp.headers.get("transfer-encoding", "").lower() == "chunked"
             )
 
             response_headers = {
diff --git a/tests/test_proxy.py b/tests/test_proxy.py
index 0b1b0d5..cf36288 100644
--- a/tests/test_proxy.py
+++ b/tests/test_proxy.py
@@ -113,3 +113,68 @@ async def test_non_streaming_response_content_length_correct():
     assert response.headers["content-length"] != upstream_content_length, (
         "upstream content-length (with trailing newline) must not bleed into response"
     )
+
+
+# ---------------------------------------------------------------------------
+# OQP-4: /api/embed returns application/json with chunked TE — must be JSONResponse
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_chunked_json_response_not_treated_as_streaming():
+    """
+    dispatch_request must return JSONResponse (not StreamingResponse) when the
+    upstream sends application/json with transfer-encoding: chunked.
+    Ollama's /api/embed does this. Treating it as streaming caused proxy_handler's
+    isinstance(response, JSONResponse) check to fail, preventing the OpenAI compat
+    wrap_response call — the root cause of OQP-4.
+    """
+    from fastapi import Request
+    from fastapi.responses import JSONResponse, StreamingResponse
+    from ollama_queue_proxy.proxy import dispatch_request
+    from ollama_queue_proxy.hosts import HostManager, OllamaHost
+    from tests.conftest import make_config
+
+    payload = {"embeddings": [[0.1, 0.2, 0.3]], "model": "bge-m3"}
+
+    mock_resp = MagicMock(spec=httpx.Response)
+    mock_resp.status_code = 200
+    mock_resp.headers = httpx.Headers({
+        "content-type": "application/json; charset=utf-8",
+        "transfer-encoding": "chunked",
+    })
+    mock_resp.json.return_value = payload
+
+    mock_client = AsyncMock(spec=httpx.AsyncClient)
+    mock_client.request = AsyncMock(return_value=mock_resp)
+
+    cfg = make_config()
+    host = OllamaHost(url="http://ollama-test:11434", name="test")
+    host.healthy = True
+    hm = HostManager.__new__(HostManager)
+    hm.hosts = [host]
+
+    scope = {
+        "type": "http",
+        "method": "POST",
+        "path": "/api/embed",
+        "query_string": b"",
+        "headers": [(b"content-type", b"application/json")],
+    }
+    request = Request(scope)
+    request.state.request_id = "test-oqp4"
+
+    response = await dispatch_request(
+        request=request,
+        body=json.dumps({"model": "bge-m3", "input": "test"}).encode(),
+        client_id=None,
+        config=cfg,
+        host_manager=hm,
+        client=mock_client,
+    )
+
+    assert isinstance(response, JSONResponse), (
+        "application/json response with transfer-encoding: chunked must return "
+        "JSONResponse so that proxy_handler can apply the OpenAI compat wrap"
+    )
+    assert not isinstance(response, StreamingResponse)
+    assert json.loads(response.body) == payload