diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9d228159..a2770840 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 7.4.1 - 2025-12-19
+
+fix: extract model from response for OpenAI stored prompts
+
+When using OpenAI stored prompts, the model is defined in the OpenAI dashboard rather than passed in the API request. This fix adds a fallback to extract the model from the response object when not provided in kwargs, ensuring generations show up with the correct model and enabling cost calculations.
+
 # 7.4.0 - 2025-12-16
 
 feat: Add automatic retries for feature flag requests
diff --git a/posthog/ai/openai/openai.py b/posthog/ai/openai/openai.py
index 11b3fe92..09984745 100644
--- a/posthog/ai/openai/openai.py
+++ b/posthog/ai/openai/openai.py
@@ -124,14 +124,23 @@ def _create_streaming(
         start_time = time.time()
         usage_stats: TokenUsage = TokenUsage()
         final_content = []
+        model_from_response: Optional[str] = None
         response = self._original.create(**kwargs)
 
         def generator():
             nonlocal usage_stats
             nonlocal final_content  # noqa: F824
+            nonlocal model_from_response
             try:
                 for chunk in response:
+                    # Extract model from response object in chunk (for stored prompts)
+                    if hasattr(chunk, "response") and chunk.response:
+                        if model_from_response is None and hasattr(
+                            chunk.response, "model"
+                        ):
+                            model_from_response = chunk.response.model
+
                     # Extract usage stats from chunk
                     chunk_usage = extract_openai_usage_from_chunk(chunk, "responses")
@@ -161,6 +170,7 @@ def generator():
                     latency,
                     output,
                     None,  # Responses API doesn't have tools
+                    model_from_response,
                 )
 
         return generator()
@@ -177,6 +187,7 @@ def _capture_streaming_event(
         latency: float,
         output: Any,
         available_tool_calls: Optional[List[Dict[str, Any]]] = None,
+        model_from_response: Optional[str] = None,
     ):
         from posthog.ai.types import StreamingEventData
         from posthog.ai.openai.openai_converter import (
@@ -189,9 +200,12 @@ def _capture_streaming_event(
         formatted_input = format_openai_streaming_input(kwargs, "responses")
         sanitized_input = sanitize_openai_response(formatted_input)
 
+        # Use model from kwargs, fallback to model from response
+        model = kwargs.get("model") or model_from_response or "unknown"
+
         event_data = StreamingEventData(
             provider="openai",
-            model=kwargs.get("model", "unknown"),
+            model=model,
             base_url=str(self._client.base_url),
             kwargs=kwargs,
             formatted_input=sanitized_input,
@@ -320,6 +334,7 @@ def _create_streaming(
         usage_stats: TokenUsage = TokenUsage()
         accumulated_content = []
         accumulated_tool_calls: Dict[int, Dict[str, Any]] = {}
+        model_from_response: Optional[str] = None
         if "stream_options" not in kwargs:
             kwargs["stream_options"] = {}
         kwargs["stream_options"]["include_usage"] = True
@@ -329,9 +344,14 @@ def generator():
             nonlocal usage_stats
             nonlocal accumulated_content  # noqa: F824
             nonlocal accumulated_tool_calls
+            nonlocal model_from_response
             try:
                 for chunk in response:
+                    # Extract model from chunk (Chat Completions chunks have model field)
+                    if model_from_response is None and hasattr(chunk, "model"):
+                        model_from_response = chunk.model
+
                     # Extract usage stats from chunk
                     chunk_usage = extract_openai_usage_from_chunk(chunk, "chat")
@@ -376,6 +396,7 @@ def generator():
                     accumulated_content,
                     tool_calls_list,
                     extract_available_tool_calls("openai", kwargs),
+                    model_from_response,
                 )
 
         return generator()
@@ -393,6 +414,7 @@ def _capture_streaming_event(
         output: Any,
         tool_calls: Optional[List[Dict[str, Any]]] = None,
        available_tool_calls: Optional[List[Dict[str, Any]]] = None,
+        model_from_response: Optional[str] = None,
     ):
         from posthog.ai.types import StreamingEventData
         from posthog.ai.openai.openai_converter import (
@@ -405,9 +427,12 @@ def _capture_streaming_event(
         formatted_input = format_openai_streaming_input(kwargs, "chat")
         sanitized_input = sanitize_openai(formatted_input)
 
+        # Use model from kwargs, fallback to model from response
+        model = kwargs.get("model") or model_from_response or "unknown"
+
         event_data = StreamingEventData(
             provider="openai",
-            model=kwargs.get("model", "unknown"),
+            model=model,
             base_url=str(self._client.base_url),
             kwargs=kwargs,
             formatted_input=sanitized_input,
diff --git a/posthog/ai/openai/openai_async.py b/posthog/ai/openai/openai_async.py
index 404895fc..77c9b260 100644
--- a/posthog/ai/openai/openai_async.py
+++ b/posthog/ai/openai/openai_async.py
@@ -128,14 +128,23 @@ async def _create_streaming(
         start_time = time.time()
         usage_stats: TokenUsage = TokenUsage()
         final_content = []
+        model_from_response: Optional[str] = None
         response = await self._original.create(**kwargs)
 
         async def async_generator():
             nonlocal usage_stats
             nonlocal final_content  # noqa: F824
+            nonlocal model_from_response
             try:
                 async for chunk in response:
+                    # Extract model from response object in chunk (for stored prompts)
+                    if hasattr(chunk, "response") and chunk.response:
+                        if model_from_response is None and hasattr(
+                            chunk.response, "model"
+                        ):
+                            model_from_response = chunk.response.model
+
                     # Extract usage stats from chunk
                     chunk_usage = extract_openai_usage_from_chunk(chunk, "responses")
@@ -166,6 +175,7 @@ async def async_generator():
                     latency,
                     output,
                     extract_available_tool_calls("openai", kwargs),
+                    model_from_response,
                 )
 
         return async_generator()
@@ -182,13 +192,17 @@ async def _capture_streaming_event(
         latency: float,
         output: Any,
         available_tool_calls: Optional[List[Dict[str, Any]]] = None,
+        model_from_response: Optional[str] = None,
     ):
         if posthog_trace_id is None:
            posthog_trace_id = str(uuid.uuid4())
 
+        # Use model from kwargs, fallback to model from response
+        model = kwargs.get("model") or model_from_response or "unknown"
+
         event_properties = {
             "$ai_provider": "openai",
-            "$ai_model": kwargs.get("model"),
+            "$ai_model": model,
             "$ai_model_parameters": get_model_params(kwargs),
             "$ai_input": with_privacy_mode(
                 self._client._ph_client,
@@ -350,6 +364,7 @@ async def _create_streaming(
         usage_stats: TokenUsage = TokenUsage()
         accumulated_content = []
         accumulated_tool_calls: Dict[int, Dict[str, Any]] = {}
+        model_from_response: Optional[str] = None
 
         if "stream_options" not in kwargs:
             kwargs["stream_options"] = {}
@@ -360,9 +375,14 @@ async def async_generator():
             nonlocal usage_stats
             nonlocal accumulated_content  # noqa: F824
             nonlocal accumulated_tool_calls
+            nonlocal model_from_response
             try:
                 async for chunk in response:
+                    # Extract model from chunk (Chat Completions chunks have model field)
+                    if model_from_response is None and hasattr(chunk, "model"):
+                        model_from_response = chunk.model
+
                     # Extract usage stats from chunk
                     chunk_usage = extract_openai_usage_from_chunk(chunk, "chat")
                     if chunk_usage:
@@ -405,6 +425,7 @@ async def async_generator():
                     accumulated_content,
                     tool_calls_list,
                     extract_available_tool_calls("openai", kwargs),
+                    model_from_response,
                 )
 
         return async_generator()
@@ -422,13 +443,17 @@ async def _capture_streaming_event(
         output: Any,
         tool_calls: Optional[List[Dict[str, Any]]] = None,
         available_tool_calls: Optional[List[Dict[str, Any]]] = None,
+        model_from_response: Optional[str] = None,
     ):
         if posthog_trace_id is None:
             posthog_trace_id = str(uuid.uuid4())
 
+        # Use model from kwargs, fallback to model from response
+        model = kwargs.get("model") or model_from_response or "unknown"
+
         event_properties = {
             "$ai_provider": "openai",
-            "$ai_model": kwargs.get("model"),
+            "$ai_model": model,
             "$ai_model_parameters": get_model_params(kwargs),
             "$ai_input": with_privacy_mode(
                 self._client._ph_client,
diff --git a/posthog/ai/utils.py b/posthog/ai/utils.py
index 559860cc..5f8a4b14 100644
--- a/posthog/ai/utils.py
+++ b/posthog/ai/utils.py
@@ -285,7 +285,7 @@ def call_llm_and_track_usage(
     event_properties = {
         "$ai_provider": provider,
-        "$ai_model": kwargs.get("model"),
+        "$ai_model": kwargs.get("model") or getattr(response, "model", None),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
             ph_client, posthog_privacy_mode, sanitized_messages
@@ -396,7 +396,7 @@ async def call_llm_and_track_usage_async(
     event_properties = {
         "$ai_provider": provider,
-        "$ai_model": kwargs.get("model"),
+        "$ai_model": kwargs.get("model") or getattr(response, "model", None),
         "$ai_model_parameters": get_model_params(kwargs),
         "$ai_input": with_privacy_mode(
             ph_client, posthog_privacy_mode, sanitized_messages
diff --git a/posthog/test/ai/openai/test_openai.py b/posthog/test/ai/openai/test_openai.py
index 07d45753..116ba2d1 100644
--- a/posthog/test/ai/openai/test_openai.py
+++ b/posthog/test/ai/openai/test_openai.py
@@ -1676,3 +1676,459 @@ async def chunk_iterable():
         assert props["$ai_web_search_count"] == 1
         assert props["$ai_input_tokens"] == 20
         assert props["$ai_output_tokens"] == 15
+
+
+# Tests for model extraction fallback (stored prompts support)
+
+
+def test_streaming_chat_extracts_model_from_chunk_when_not_in_kwargs(mock_client):
+    """Test that model is extracted from streaming chunks when not provided in kwargs (stored prompts)."""
+
+    # Create streaming chunks with model field but we won't pass model in kwargs
+    chunks = [
+        ChatCompletionChunk(
+            id="chunk1",
+            model="gpt-4o-stored-prompt",  # Model comes from response, not request
+            object="chat.completion.chunk",
+            created=1234567890,
+            choices=[
+                ChoiceChunk(
+                    index=0,
+                    delta=ChoiceDelta(role="assistant", content="Hello"),
+                    finish_reason=None,
+                )
+            ],
+        ),
+        ChatCompletionChunk(
+            id="chunk2",
+            model="gpt-4o-stored-prompt",
+            object="chat.completion.chunk",
+            created=1234567891,
+            choices=[
+                ChoiceChunk(
+                    index=0,
+                    delta=ChoiceDelta(content=" world"),
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        ),
+    ]
+
+    with patch("openai.resources.chat.completions.Completions.create") as mock_create:
+        mock_create.return_value = chunks
+
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model in kwargs - simulates stored prompt usage
+        response_generator = client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        # Consume the generator
+        list(response_generator)
+
+        assert mock_client.capture.call_count == 1
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Model should be extracted from chunk, not kwargs
+        assert props["$ai_model"] == "gpt-4o-stored-prompt"
+
+
+def test_streaming_chat_prefers_kwargs_model_over_chunk_model(mock_client):
+    """Test that model from kwargs takes precedence over model from chunk."""
+    chunks = [
+        ChatCompletionChunk(
+            id="chunk1",
+            model="gpt-4o-from-response",
+            object="chat.completion.chunk",
+            created=1234567890,
+            choices=[
+                ChoiceChunk(
+                    index=0,
+                    delta=ChoiceDelta(role="assistant", content="Hello"),
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        ),
+    ]
+
+    with patch("openai.resources.chat.completions.Completions.create") as mock_create:
+        mock_create.return_value = chunks
+
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        response_generator = client.chat.completions.create(
+            model="gpt-4o-from-kwargs",  # Explicitly passed model
+            messages=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        list(response_generator)
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # kwargs model should take precedence
+        assert props["$ai_model"] == "gpt-4o-from-kwargs"
+
+
+def test_streaming_responses_api_extracts_model_from_response_object(mock_client):
+    """Test that Responses API streaming extracts model from chunk.response.model (stored prompts)."""
+    from unittest.mock import MagicMock
+    from openai.types.responses import ResponseUsage
+
+    chunks = []
+
+    # Content chunk
+    chunk1 = MagicMock()
+    chunk1.type = "response.text.delta"
+    chunk1.text = "Test response"
+    # No response attribute on content chunks
+    del chunk1.response
+    chunks.append(chunk1)
+
+    # Final chunk with response object containing model
+    chunk2 = MagicMock()
+    chunk2.type = "response.completed"
+    chunk2.response = MagicMock()
+    chunk2.response.model = "gpt-4o-mini-stored"  # Model from stored prompt
+    chunk2.response.usage = ResponseUsage(
+        input_tokens=20,
+        output_tokens=10,
+        total_tokens=30,
+        input_tokens_details={"prompt_tokens": 20, "cached_tokens": 0},
+        output_tokens_details={"reasoning_tokens": 0},
+    )
+    chunk2.response.output = ["Test response"]
+    chunks.append(chunk2)
+
+    with patch("openai.resources.responses.Responses.create") as mock_create:
+        mock_create.return_value = iter(chunks)
+
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model - simulates stored prompt
+        response_generator = client.responses.create(
+            input=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        list(response_generator)
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Model should be extracted from chunk.response.model
+        assert props["$ai_model"] == "gpt-4o-mini-stored"
+
+
+def test_non_streaming_extracts_model_from_response(mock_client):
+    """Test that non-streaming calls extract model from response when not in kwargs."""
+    # Create a response with model but we won't pass model in kwargs
+    mock_response = ChatCompletion(
+        id="test",
+        model="gpt-4o-stored-prompt",
+        object="chat.completion",
+        created=int(time.time()),
+        choices=[
+            Choice(
+                finish_reason="stop",
+                index=0,
+                message=ChatCompletionMessage(
+                    content="Test response",
+                    role="assistant",
+                ),
+            )
+        ],
+        usage=CompletionUsage(
+            completion_tokens=10,
+            prompt_tokens=20,
+            total_tokens=30,
+        ),
+    )
+
+    with patch(
+        "openai.resources.chat.completions.Completions.create",
+        return_value=mock_response,
+    ):
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model in kwargs
+        response = client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello"}],
+            posthog_distinct_id="test-id",
+        )
+
+        assert response == mock_response
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Model should be extracted from response.model
+        assert props["$ai_model"] == "gpt-4o-stored-prompt"
+
+
+def test_non_streaming_responses_api_extracts_model_from_response(mock_client):
+    """Test that non-streaming Responses API extracts model from response when not in kwargs."""
+    mock_response = Response(
+        id="test",
+        model="gpt-4o-mini-stored",
+        object="response",
+        created_at=1741476542,
+        status="completed",
+        error=None,
+        incomplete_details=None,
+        instructions=None,
+        max_output_tokens=None,
+        tools=[],
+        tool_choice="auto",
+        output=[
+            ResponseOutputMessage(
+                id="msg_123",
+                type="message",
+                role="assistant",
+                status="completed",
+                content=[
+                    ResponseOutputText(
+                        type="output_text",
+                        text="Test response",
+                        annotations=[],
+                    )
+                ],
+            )
+        ],
+        parallel_tool_calls=True,
+        previous_response_id=None,
+        usage=ResponseUsage(
+            input_tokens=10,
+            output_tokens=10,
+            input_tokens_details={"prompt_tokens": 10, "cached_tokens": 0},
+            output_tokens_details={"reasoning_tokens": 0},
+            total_tokens=20,
+        ),
+        user=None,
+        metadata={},
+    )
+
+    with patch(
+        "openai.resources.responses.Responses.create",
+        return_value=mock_response,
+    ):
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model in kwargs
+        response = client.responses.create(
+            input="Hello",
+            posthog_distinct_id="test-id",
+        )
+
+        assert response == mock_response
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Model should be extracted from response.model
+        assert props["$ai_model"] == "gpt-4o-mini-stored"
+
+
+def test_non_streaming_returns_none_when_no_model(mock_client):
+    """Test that non-streaming returns None (not 'unknown') when model is not available anywhere."""
+    # Create a response without model attribute using real OpenAI types
+    mock_response = ChatCompletion(
+        id="test",
+        model="",  # Will be removed below
+        object="chat.completion",
+        created=int(time.time()),
+        choices=[
+            Choice(
+                finish_reason="stop",
+                index=0,
+                message=ChatCompletionMessage(
+                    content="Test response",
+                    role="assistant",
+                ),
+            )
+        ],
+        usage=CompletionUsage(
+            completion_tokens=5,
+            prompt_tokens=10,
+            total_tokens=15,
+        ),
+    )
+    # Remove model attribute to simulate missing model
+    object.__delattr__(mock_response, "model")
+
+    with patch(
+        "openai.resources.chat.completions.Completions.create",
+        return_value=mock_response,
+    ):
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model in kwargs and response has no model
+        client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello"}],
+            posthog_distinct_id="test-id",
+        )
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Should be None, NOT "unknown" (to avoid incorrect cost matching)
+        assert props["$ai_model"] is None
+
+
+def test_streaming_falls_back_to_unknown_when_no_model(mock_client):
+    """Test that streaming falls back to 'unknown' when model is not available anywhere."""
+    from unittest.mock import MagicMock
+
+    # Create a chunk without model attribute
+    chunk = MagicMock()
+    chunk.choices = [MagicMock()]
+    chunk.choices[0].delta = MagicMock()
+    chunk.choices[0].delta.content = "Hello"
+    chunk.choices[0].delta.role = "assistant"
+    chunk.choices[0].delta.tool_calls = None
+    chunk.usage = CompletionUsage(
+        prompt_tokens=10,
+        completion_tokens=5,
+        total_tokens=15,
+    )
+    # Explicitly remove model attribute
+    del chunk.model
+
+    with patch("openai.resources.chat.completions.Completions.create") as mock_create:
+        mock_create.return_value = [chunk]
+
+        client = OpenAI(api_key="test-key", posthog_client=mock_client)
+
+        response_generator = client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        list(response_generator)
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        # Should fall back to "unknown"
+        assert props["$ai_model"] == "unknown"
+
+
+@pytest.mark.asyncio
+async def test_async_streaming_chat_extracts_model_from_chunk(mock_client):
+    """Test async streaming extracts model from chunk when not in kwargs."""
+    chunks = [
+        ChatCompletionChunk(
+            id="chunk1",
+            model="gpt-4o-async-stored",
+            object="chat.completion.chunk",
+            created=1234567890,
+            choices=[
+                ChoiceChunk(
+                    index=0,
+                    delta=ChoiceDelta(role="assistant", content="Hello"),
+                    finish_reason="stop",
+                )
+            ],
+            usage=CompletionUsage(
+                prompt_tokens=10,
+                completion_tokens=5,
+                total_tokens=15,
+            ),
+        ),
+    ]
+
+    async def mock_create(self, **kwargs):
+        async def chunk_iterable():
+            for chunk in chunks:
+                yield chunk
+
+        return chunk_iterable()
+
+    with patch(
+        "openai.resources.chat.completions.AsyncCompletions.create", new=mock_create
+    ):
+        client = AsyncOpenAI(api_key="test-key", posthog_client=mock_client)
+
+        # Note: NOT passing model
+        response_stream = await client.chat.completions.create(
+            messages=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        async for _ in response_stream:
+            pass
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        assert props["$ai_model"] == "gpt-4o-async-stored"
+
+
+@pytest.mark.asyncio
+async def test_async_streaming_responses_extracts_model_from_response(mock_client):
+    """Test async Responses API streaming extracts model from chunk.response.model."""
+    from unittest.mock import MagicMock
+    from openai.types.responses import ResponseUsage
+
+    chunks = []
+
+    chunk1 = MagicMock()
+    chunk1.type = "response.text.delta"
+    chunk1.text = "Test"
+    del chunk1.response
+    chunks.append(chunk1)
+
+    chunk2 = MagicMock()
+    chunk2.type = "response.completed"
+    chunk2.response = MagicMock()
+    chunk2.response.model = "gpt-4o-mini-async-stored"
+    chunk2.response.usage = ResponseUsage(
+        input_tokens=20,
+        output_tokens=10,
+        total_tokens=30,
+        input_tokens_details={"prompt_tokens": 20, "cached_tokens": 0},
+        output_tokens_details={"reasoning_tokens": 0},
+    )
+    chunk2.response.output = ["Test"]
+    chunks.append(chunk2)
+
+    async def mock_create(self, **kwargs):
+        async def chunk_iterable():
+            for chunk in chunks:
+                yield chunk
+
+        return chunk_iterable()
+
+    with patch("openai.resources.responses.AsyncResponses.create", new=mock_create):
+        client = AsyncOpenAI(api_key="test-key", posthog_client=mock_client)
+
+        response_stream = await client.responses.create(
+            input=[{"role": "user", "content": "Hello"}],
+            stream=True,
+            posthog_distinct_id="test-id",
+        )
+
+        async for _ in response_stream:
+            pass
+
+        call_args = mock_client.capture.call_args[1]
+        props = call_args["properties"]
+
+        assert props["$ai_model"] == "gpt-4o-mini-async-stored"
diff --git a/posthog/version.py b/posthog/version.py
index 3de0587a..10523a6e 100644
--- a/posthog/version.py
+++ b/posthog/version.py
@@ -1,4 +1,4 @@
-VERSION = "7.4.0"
+VERSION = "7.4.1"
 
 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201
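
Note on the fallback order this patch implements: a minimal standalone sketch, separate from the diff above and not part of the SDK. The helper name `resolve_model` is hypothetical; the resolution order (request kwargs first, then the model reported on the response, then "unknown" for the streaming capture paths and None for the non-streaming path) is taken from the changes and tests above.

```python
from typing import Any, Dict, Optional


def resolve_model(kwargs: Dict[str, Any], response: Any, streaming: bool) -> Optional[str]:
    """Hypothetical illustration of the model fallback used in this patch."""
    # Prefer the model explicitly passed in the request kwargs; for stored
    # prompts it is absent, so fall back to the model on the response object
    # (which the OpenAI dashboard defines).
    model = kwargs.get("model") or getattr(response, "model", None)
    if model is None and streaming:
        # Streaming capture paths default to "unknown".
        return "unknown"
    # Non-streaming path keeps None to avoid incorrect cost matching.
    return model
```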