From 9859d087f53955e38fb085c5a33a8643bcee7acb Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Wed, 25 Mar 2026 17:09:34 -0800 Subject: [PATCH 01/11] feat: implements optimize method in SDK, code moved --- .../src/ldai_optimization/__init__.py | 4 +- .../src/ldai_optimization/client.py | 808 +++++++++++++++++- .../src/ldai_optimization/dataclasses.py | 238 ++++++ .../src/ldai_optimization/util.py | 201 +++++ packages/optimization/tests/test_package.py | 4 +- 5 files changed, 1239 insertions(+), 16 deletions(-) create mode 100644 packages/optimization/src/ldai_optimization/dataclasses.py create mode 100644 packages/optimization/src/ldai_optimization/util.py diff --git a/packages/optimization/src/ldai_optimization/__init__.py b/packages/optimization/src/ldai_optimization/__init__.py index 7df6ed3..6319dee 100644 --- a/packages/optimization/src/ldai_optimization/__init__.py +++ b/packages/optimization/src/ldai_optimization/__init__.py @@ -3,11 +3,11 @@ This package will provide helpers to run selected tools against the LaunchDarkly API from SDK-based workflows. 
""" -from ldai_optimization.client import ApiAgentOptimizationClient +from ldai_optimization.client import OptimizationClient __version__ = "0.0.0" __all__ = [ '__version__', - 'ApiAgentOptimizationClient', + 'OptimizationClient', ] diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index 75c3858..b4d4872 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -1,20 +1,804 @@ -"""Client placeholder for LaunchDarkly API tool execution.""" +"""Client for LaunchDarkly AI agent optimization.""" -from typing import Any, Dict +from typing import Any, Dict, List, Optional +import dataclasses +import os +import logging +import random +import json +from ldai import LDAIClient, AIJudgeConfigDefault, AIAgentConfig -class ApiAgentOptimizationClient: - """Coordinates running supported tools against the LaunchDarkly API. +from ldai_optimization.dataclasses import ( + AutoCommitConfig, + JudgeResult, + Message, + OptimizationContext, + OptimizationJudge, + OptimizationJudgeContext, + OptimizationOptions, +) +from ldai_optimization.util import ( + await_if_needed, + create_evaluation_tool, + create_variation_tool, + extract_json_from_response, +) - This type is scaffolding; concrete behavior will be added in a future release. - """ +logger = logging.getLogger(__name__) - def optimize(self, tool_name: str, parameters: Dict[str, Any]) -> Any: - """Execute a supported LaunchDarkly API tool by name. - :param tool_name: Identifier of the tool to invoke. - :param parameters: Tool-specific request parameters. - :return: Tool-specific response data. - :raises NotImplementedError: Until the API integration is implemented. 
+class OptimizationClient: + _options: OptimizationOptions + _ldClient: LDAIClient + _has_api_key: bool + _api_key: Optional[str] + _agent_key: str + + def __init__(self, ldClient: LDAIClient) -> None: + self._ldClient = ldClient + + if os.environ.get("LAUNCHDARKLY_API_KEY"): + self._has_api_key = True + self._api_key = os.environ.get("LAUNCHDARKLY_API_KEY") + else: + self._has_api_key = False + self._api_key = None + logger.warning( + "LAUNCHDARKLY_API_KEY is not set, functionality will be limited" + ) + + def _initialize_class_members_from_config( + self, agent_config: AIAgentConfig + ) -> None: + self._current_instructions = agent_config.instructions or "" + self._current_parameters: Dict[str, Any] = agent_config.model._parameters or {} + self._current_model: Optional[str] = ( + agent_config.model.name if agent_config.model else None + ) + self._history: List[OptimizationContext] = [] + + def _create_optimization_context( + self, + iteration: int, + user_input: Optional[str] = None, + completion_response: str = "", + scores: Optional[Dict[str, JudgeResult]] = None, + ) -> OptimizationContext: + """ + Create an OptimizeContext with current state. 
+ + :param iteration: Current iteration number + :param user_input: Optional user input for this iteration + :param completion_response: Completion response string + :param scores: Optional dictionary of judge results + :return: A new OptimizeContext instance + """ + flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history] + return OptimizationContext( + scores=scores or {}, + completion_response=completion_response, + current_instructions=self._current_instructions, + current_parameters=self._current_parameters.copy(), + current_model=self._current_model, + user_input=user_input, + history=tuple(flat_history), + iteration=iteration, + ) + + def _safe_status_update( + self, status: str, context: OptimizationContext, iteration: int + ) -> None: + """ + Safely call on_status_update callback, catching and logging errors. + + :param status: The status string to pass to the callback + :param context: The optimization context to pass to the callback + :param iteration: Current iteration number for logging + """ + if self._options.on_status_update: + try: + self._options.on_status_update(status, context.copy_without_history()) + except Exception as e: + logger.exception( + "[Turn %d] -> on_status_update callback failed", iteration + ) + + async def _call_judges( + self, completion_response: str, iteration: int + ) -> Dict[str, JudgeResult]: + """ + Call all judges in parallel (auto-path). + + For judges with judge_key: Fetches judge config on-demand from LaunchDarkly SDK. + For judges with acceptance_statement: Uses handle_judge_call callback. 
+ + :param completion_response: The agent's completion response to evaluate + :param iteration: Current iteration number + :return: Dictionary of judge results (score and rationale) + """ + if not self._options.judges: + return {} + + logger.info("[Turn %d] -> Executing evaluation...", iteration) + reasoning_history = self._build_reasoning_history() + judge_results: Dict[str, JudgeResult] = {} + + for judge_key, optimization_judge in self._options.judges.items(): + try: + if optimization_judge.judge_key is not None: + result = await self._evaluate_config_judge( + judge_key, + optimization_judge, + completion_response, + iteration, + reasoning_history, + ) + judge_results[judge_key] = result + else: + result = await self._evaluate_acceptance_judge( + judge_key, + optimization_judge, + completion_response, + iteration, + reasoning_history, + ) + judge_results[judge_key] = result + except Exception as e: + logger.exception( + "[Turn %d] -> Judge %s evaluation failed", iteration, judge_key + ) + judge_results[judge_key] = JudgeResult(score=0.0, rationale=None) + + judge_results_json = self._serialize_scores(judge_results) + logger.info( + "[Turn %d] -> Evaluation result: %s", + iteration, + json.dumps(judge_results_json, indent=2), + ) + return judge_results + + async def _evaluate_config_judge( + self, + judge_key: str, + optimization_judge: "OptimizationJudge", + completion_response: str, + iteration: int, + reasoning_history: str, + ) -> JudgeResult: + """ + Evaluate using a config-type judge (with judge_key). 
+ + :param judge_key: The key for this judge in the judges dict + :param optimization_judge: The optimization judge configuration + :param completion_response: The agent's completion response to evaluate + :param iteration: Current iteration number + :param reasoning_history: Formatted string of reasoning from previous iterations + :return: The judge result with score and rationale + """ + # Config-type judge: fetch judge config on-demand from LaunchDarkly SDK + input_text = self._current_instructions or "" + # Combine current instructions with reasoning history for message_history + message_history_text = self._build_message_history_text( + input_text, reasoning_history + ) + + judge_config = self._judge_config( + optimization_judge.judge_key, + self._options.context_choices[0], + AIJudgeConfigDefault(enabled=False), + { + "message_history": message_history_text, + "response_to_evaluate": completion_response, + }, + ) + + if not judge_config.enabled: + logger.warning( + "[Turn %d] -> Judge %s is disabled", + iteration, + optimization_judge.judge_key, + ) + return JudgeResult(score=0.0, rationale=None) + + if not judge_config.messages: + logger.warning( + "[Turn %d] -> Judge %s has no messages", + iteration, + optimization_judge.judge_key, + ) + return JudgeResult(score=0.0, rationale=None) + + # Convert LDMessage to Message objects, appending structured output instruction to system messages + judge_messages = [] + for msg in judge_config.messages: + content = msg.content + if msg.role == "system": + content += " Use the structured output tool to format your response. You should always return a JSON object with a score and rationale." 
+ judge_messages.append(Message(role=msg.role, content=content)) + + # Build parameters from judge config, hoisting any pre-existing tools so we can append ours + parameters = {} + tools = [] + if judge_config.model: + parameters["model"] = judge_config.model.name + if judge_config.model._parameters: + # Extract tools if present + existing_tools = judge_config.model._parameters.get("tools") + if existing_tools: + tools = ( + existing_tools + if isinstance(existing_tools, list) + else [existing_tools] + ) + # Convert to dicts if needed + tools = [ + tool.to_dict() if hasattr(tool, "to_dict") else tool + for tool in tools + ] + # Copy parameters excluding tools + parameters.update( + { + k: v + for k, v in judge_config.model._parameters.items() + if k != "tools" + } + ) + + # Add structured output tool for score and rationale + tools.append(create_evaluation_tool().to_dict()) + + judge_ctx = OptimizationJudgeContext( + messages=judge_messages, + parameters=parameters, + tools=tools, + ) + + result = self._options.handle_judge_call(self._options.judge_model, judge_ctx) + judge_response_str = await await_if_needed(result) + + logger.info( + "[Turn %d] -> Judge response (%s): %s", + iteration, + judge_key, + judge_response_str, + ) + + # Parse judge response — expect structured JSON output + judge_identifier = optimization_judge.judge_key or judge_key + return self._parse_judge_response( + judge_response_str, + judge_key, + judge_identifier, + iteration, + clamp_score=False, + ) + + async def _evaluate_acceptance_judge( + self, + judge_key: str, + optimization_judge: "OptimizationJudge", + completion_response: str, + iteration: int, + reasoning_history: str, + ) -> JudgeResult: + """ + Evaluate using an acceptance statement judge. 
+ + :param judge_key: The key for this judge in the judges dict + :param optimization_judge: The optimization judge configuration + :param completion_response: The agent's completion response to evaluate + :param iteration: Current iteration number + :param reasoning_history: Formatted string of reasoning from previous iterations + :return: The judge result with score and rationale + """ + if not optimization_judge.acceptance_statement: + logger.error( + "[Turn %d] -> Judge %s has no acceptance_statement", + iteration, + judge_key, + ) + return JudgeResult(score=0.0, rationale=None) + + # Build message history with reasoning for context + message_history_text = self._build_message_history_text("", reasoning_history) + + # Build judge context for LLM call + judge_messages = [ + Message( + role="system", + content=f"""You are a judge that evaluates the response to the user's question. + + Here is the statement that you should evaluate the response against: '{optimization_judge.acceptance_statement}' + Here is the history of all messages between the user and the assistant: {message_history_text} + You should score the response based on how well it meets the acceptance statement using a score between 0.0 and 1.0. + A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly. + A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well. + A score of 0.0-0.3 means that it does not match well at all. You can return any value between 0.0 and 1.0. + You should also provide a rationale for your score. + You should call the structured output tool to format your response. + + Here is an example of a good response: + {{ + "score": 0.8, + "rationale": "The response matches the acceptance statement well. It provides a detailed explanation of the concept and its applications." 
+ }} + """, + ), + Message( + role="user", + content=f"Here is the response to evaluate: {completion_response}", + ), + ] + + # Create structured output tool for evaluation response with score and rationale + evaluation_tool = create_evaluation_tool() + + judge_ctx = OptimizationJudgeContext( + messages=judge_messages, + parameters={"model": self._options.judge_model}, + tools=[evaluation_tool.to_dict()], + ) + + result = self._options.handle_judge_call(self._options.judge_model, judge_ctx) + judge_response = await await_if_needed(result) + + logger.info( + "[Turn %d] -> Judge response (%s): %s", iteration, judge_key, judge_response + ) + + # Parse judge response — expect structured JSON output with score and rationale + return self._parse_judge_response( + judge_response, judge_key, judge_key, iteration, clamp_score=True + ) + + async def _get_agent_config(self, agent_key: str) -> AIAgentConfig: + """ + Get the agent configuration from the LaunchDarkly client. + + :param agent_key: The key for the agent to get the configuration for + :return: The agent configuration + """ + try: + agent_config = await self._ldClient.agent_config(agent_key) + self._initialize_class_members(agent_config) + return agent_config + except Exception as e: + logger.exception("[Optimization] -> Failed to get agent configuration") + raise + + async def optimize_from_options( + self, agent_key: str, options: OptimizationOptions + ) -> Any: + """Execute an optimization on the given agent with the given options. + + :param agent_key: Identifier of the agent to optimize. + :param options: Optimization options. + :return: Optimization result. + """ + self._agent_key = agent_key + agent_config = await self._get_agent_config(agent_key) + return await self._run_optimization(agent_config, options) + + def _build_new_variation_prompt( + self, previous_ctx: Optional[OptimizationContext] + ) -> str: + """ + Build the LLM prompt for generating an improved agent configuration. 
+ + Constructs a detailed instruction string based on the previous iteration's + configuration, completion result, and judge scores. When no previous context + exists (first variation attempt), asks the LLM to improve the current config + without evaluation feedback. + + :param previous_ctx: The most recent OptimizationContext, or None on the first attempt + :return: The assembled prompt string + """ + sections = [ + self._new_variation_prompt_preamble(), + self._new_variation_prompt_configuration(previous_ctx), + self._new_variation_prompt_feedback(previous_ctx), + self._new_variation_prompt_improvement_instructions(previous_ctx), + ] + return "\n\n".join(s for s in sections if s) + + def _new_variation_prompt_preamble(self) -> str: + """Static opening section for the variation generation prompt.""" + return "\n".join([ + "You are an assistant that helps improve agent configurations through iterative optimization.", + "", + "Your task is to generate improved agent instructions and parameters based on the feedback provided.", + ]) + + def _new_variation_prompt_configuration( + self, previous_ctx: Optional[OptimizationContext] + ) -> str: + """ + Configuration section of the variation prompt. + + Shows the previous iteration's model, instructions, parameters, and completion + response when available, or the current instance state on the first attempt. 
+ """ + if previous_ctx: + return "\n".join([ + "## Previous Configuration:", + f"Model: {previous_ctx.current_model}", + f"Instructions: {previous_ctx.current_instructions}", + f"Parameters: {previous_ctx.current_parameters}", + "", + "## Previous Result:", + previous_ctx.completion_response, + ]) + else: + return "\n".join([ + "## Current Configuration:", + f"Model: {self._current_model}", + f"Instructions: {self._current_instructions}", + f"Parameters: {self._current_parameters}", + ]) + + def _new_variation_prompt_feedback( + self, previous_ctx: Optional[OptimizationContext] + ) -> str: + """ + Evaluation feedback section of the variation prompt. + + Returns an empty string when there are no scores so it is filtered out + of the assembled prompt entirely. + """ + if not previous_ctx or not previous_ctx.scores: + return "" + + lines = ["## Evaluation Feedback:"] + for judge_key, result in previous_ctx.scores.items(): + optimization_judge = ( + self._options.judges.get(judge_key) + if self._options.judges + else None + ) + if optimization_judge: + score = result.score + if optimization_judge.threshold is not None: + passed = score >= optimization_judge.threshold + status = "PASSED" if passed else "FAILED" + feedback_line = f"- {judge_key}: Score {score:.3f} (threshold: {optimization_judge.threshold}) - {status}" + if result.rationale: + feedback_line += f"\n Reasoning: {result.rationale}" + lines.append(feedback_line) + else: + passed = score >= 1.0 + status = "PASSED" if passed else "FAILED" + feedback_line = f"- {judge_key}: {status}" + if result.rationale: + feedback_line += f"\n Reasoning: {result.rationale}" + lines.append(feedback_line) + return "\n".join(lines) + + def _new_variation_prompt_improvement_instructions( + self, previous_ctx: Optional[OptimizationContext] + ) -> str: + """ + Improvement instructions section of the variation prompt. + + Includes model-choice guidance and the required output format schema. 
+ When previous_ctx is provided, adds feedback-driven improvement directives. + """ + model_instructions = "\n".join([ + "You may also choose to change the model if you believe that the current model is not performing well or a different model would be better suited for the task. " + f"Here are the models you may choose from: {self._options.model_choices}. You must always return a model property, even if it's the same as the current model.", + "When suggesting a new model, you should provide a rationale for why you believe the new model would be better suited for the task.", + ]) + + parameters_instructions = "\n".join([ + "Return these values in a JSON object with the following keys: current_instructions, current_parameters, and model.", + "Example:", + "{", + ' "current_instructions": "...', + ' "current_parameters": {', + ' "...": "..."', + " },", + ' "model": "gpt-4o"', + "}", + "Parameters should only be things that are directly parseable by an LLM call, for example, temperature, max_tokens, etc." + "Do not include any other parameters that are not directly parseable by an LLM call. If you want to provide instruction for tone or other attributes, provide them directly in the instructions.", + ]) + + if previous_ctx: + return "\n".join([ + "## Improvement Instructions:", + "Based on the evaluation feedback above, generate improved agent instructions and parameters.", + "Focus on addressing the areas where the evaluation failed or scored below threshold.", + "The new configuration should aim to improve the agent's performance on the evaluation criteria.", + model_instructions, + "", + "Return the improved configuration in a structured format that can be parsed to update:", + "1. The agent instructions (current_instructions)", + "2. The agent parameters (current_parameters)", + "3. 
The model (model) - you must always return a model, even if it's the same as the current model.", + parameters_instructions, + ]) + else: + return "\n".join([ + "Generate an improved version of this configuration.", + model_instructions, + parameters_instructions, + ]) + + def _apply_new_variation_response( + self, + response_data: Dict[str, Any], + variation_ctx: OptimizationContext, + response_str: str, + iteration: int, + ) -> OptimizationContext: + """ + Validate the parsed variation response, mutate instance state, and return + an updated OptimizationContext reflecting the new configuration. + + Updates self._current_instructions, self._current_parameters, and + self._current_model in place so subsequent turns use the new configuration. + + :param response_data: Parsed JSON dict from the LLM variation response + :param variation_ctx: The context that was sent to the LLM (used to carry history/iteration) + :param response_str: The raw response string (stored as completion_response) + :param iteration: Current iteration number for logging + :return: A new OptimizationContext populated with the updated configuration + """ + missing_fields = [] + if "current_instructions" not in response_data: + missing_fields.append("current_instructions") + if "current_parameters" not in response_data: + missing_fields.append("current_parameters") + if "model" not in response_data: + missing_fields.append("model") + + if missing_fields: + logger.error( + "[Turn %d] -> Response missing required fields: %s. Received fields: %s. Full response_data: %s", + iteration, + ", ".join(missing_fields), + list(response_data.keys()), + json.dumps(response_data, indent=2), + ) + raise ValueError( + f"Response missing required fields: {', '.join(missing_fields)}. 
" + f"Received fields: {list(response_data.keys())}" + ) + + self._current_instructions = response_data["current_instructions"] + self._current_parameters = response_data["current_parameters"] + + # Update model — it should always be provided since it's required in the schema + model_value = ( + response_data.get("model", "").strip() + if isinstance(response_data.get("model"), str) + else response_data.get("model") + ) + if not model_value: + logger.warning( + "[Turn %d] -> Model field is empty or None in response, keeping current model %s", + iteration, + self._current_model, + ) + elif model_value not in self._options.model_choices: + logger.warning( + "[Turn %d] -> Model '%s' not in model_choices %s, keeping current model %s", + iteration, + model_value, + self._options.model_choices, + self._current_model, + ) + else: + old_model = self._current_model + self._current_model = model_value + logger.info( + "[Turn %d] -> Model updated from '%s' to '%s'", + iteration, + old_model, + self._current_model, + ) + + logger.info( + "[Turn %d] -> New variation generated: instructions='%.100s...', model=%s, parameters=%s", + iteration, + self._current_instructions, + self._current_model, + self._current_parameters, + ) + + # Create a new context with the updated values for return + return OptimizationContext( + scores={}, + completion_response=response_str, + current_instructions=self._current_instructions, + current_parameters=self._current_parameters.copy(), + current_model=self._current_model, + user_input=None, + history=variation_ctx.history, + iteration=variation_ctx.iteration, + structured_output_tool=variation_ctx.structured_output_tool, + ) + + async def _generate_new_variation(self, iteration: int) -> OptimizationContext: + """ + Generate new variation for next iteration (auto-path). + + Calls handle_agent_call to generate a new variation and updates current_instructions + and current_parameters based on the returned OptimizeContext. 
+ + :param iteration: The current iteration number for logging """ + logger.info("[Turn %d] -> Generating new variation...", iteration) + + # Create a context for status update before generating the variation + status_ctx = self._create_optimization_context(iteration=iteration) + self._safe_status_update("generating variation", status_ctx, iteration) + + # Get the most recent context for previous result and feedback + previous_ctx = self._history[-1] if self._history else None + + instructions = self._build_new_variation_prompt(previous_ctx) + + # Create structured output tool definition for variation generation + structured_output_tool = create_variation_tool(self._options.model_choices) + + # Create a flat history list (without nested history) to avoid exponential growth + flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history] + + # Create context for variation generation — low temperature for deterministic output + variation_ctx = OptimizationContext( + scores={}, + completion_response="", + current_instructions=instructions, + current_parameters={"temperature": 0.1}, + current_model=self._current_model, + user_input=None, + history=tuple(flat_history), + iteration=len(self._history) + 1, + structured_output_tool=structured_output_tool, + ) + + # Call handle_agent_call to generate new variation; expects a JSON string + # matching the structured output schema (current_instructions, current_parameters, model) + result = self._options.handle_agent_call(self._agent_key, variation_ctx) + response_str = await await_if_needed(result) + + # Extract and update current state from the parsed response + response_data = extract_json_from_response(response_str) + return self._apply_new_variation_response( + response_data, variation_ctx, response_str, iteration + ) + + async def optimize_from_config( + self, agent_key: str, optimization_config_key: str + ) -> Any: + """Optimize an agent from a configuration. 
+ + :param agent_key: Identifier of the agent to optimize. + :param optimization_config_key: Identifier of the optimization configuration to use. + :return: Optimization result. + """ + if not self._has_api_key: + raise ValueError( + "LAUNCHDARKLY_API_KEY is not set, so optimize_from_config is not available" + ) + + self._agent_key = agent_key + agent_config = await self._get_agent_config(agent_key) + raise NotImplementedError + + async def _execute_agent_turn( + self, + optimize_context: OptimizationContext, + iteration: int, + ) -> OptimizationContext: + """ + Run the agent call and judge scoring for one optimization turn. + + Returns a new OptimizationContext with completion_response and scores + populated, leaving the input context unchanged. + + :param optimize_context: The context for this turn (instructions, model, history, etc.) + :param iteration: Current iteration number for logging and status callbacks + :return: Updated context with completion_response and scores filled in + """ + try: + result = self._options.handle_agent_call(self._agent_key, optimize_context) + completion_response = await await_if_needed(result) + except Exception: + logger.exception("[Turn %d] -> Agent call failed", iteration) + if self._options.on_failing_result: + self._options.on_failing_result(optimize_context) + raise + + scores: Dict[str, JudgeResult] = {} + if self._options.judges: + self._safe_status_update("evaluating", optimize_context, iteration) + scores = await self._call_judges(completion_response, iteration) + + return dataclasses.replace( + optimize_context, + completion_response=completion_response, + scores=scores, + ) + + async def _run_optimization( + self, agent_config: AIAgentConfig, options: OptimizationOptions + ) -> Any: + """Run an optimization on the given agent with the given options. + + :param agent_config: Agent configuration from LaunchDarkly. + :param options: Optimization options. + :return: Optimization result. 
+ """ + self._options = options + self._initialize_class_members_from_config(agent_config) + + initial_context = self._create_optimization_context( + iteration=0, + ) + + self._safe_status_update("init", initial_context, 0) + + iteration = 0 + while True: + iteration += 1 + logger.info("[Turn %d] -> Starting", iteration) + user_input = None + if self._options.user_input_options: + user_input = random.choice(self._options.user_input_options) + + optimize_context = self._create_optimization_context( + iteration=iteration, + user_input=user_input, + ) + + self._safe_status_update("generating", optimize_context, iteration) + optimize_context = await self._execute_agent_turn(optimize_context, iteration) + + # Manual path: on_turn callback gives caller full control over pass/fail + if self._options.on_turn is not None: + try: + on_turn_result = self._options.on_turn(optimize_context) + if on_turn_result: + # on_turn returned True — success + return self._handle_success(optimize_context, iteration) + else: + # on_turn returned False — generate new variation and continue + if iteration >= self._options.max_attempts: + return self._handle_failure(optimize_context, iteration) + self._history.append(optimize_context) + await self._generate_new_variation(iteration) + # Notify before starting next turn + self._safe_status_update( + "turn completed", optimize_context, iteration + ) + continue + except Exception as e: + logger.exception( + "[Turn %d] -> on_turn evaluation failed", iteration + ) + self._history.append(optimize_context) + await self._generate_new_variation(iteration) + if iteration >= self._options.max_attempts: + return self._handle_failure(optimize_context, iteration) + self._safe_status_update( + "turn completed", optimize_context, iteration + ) + continue + else: + # Auto-path: judge scores determine pass/fail via _evaluate_response + passes = self._evaluate_response(optimize_context) + if passes: + return self._handle_success(optimize_context, iteration) + 
else: + self._history.append(optimize_context) + await self._generate_new_variation(iteration) + # Check max_attempts after generating variation + if iteration >= self._options.max_attempts: + return self._handle_failure(optimize_context, iteration) + self._safe_status_update( + "turn completed", optimize_context, iteration + ) + continue diff --git a/packages/optimization/src/ldai_optimization/dataclasses.py b/packages/optimization/src/ldai_optimization/dataclasses.py new file mode 100644 index 0000000..d57b346 --- /dev/null +++ b/packages/optimization/src/ldai_optimization/dataclasses.py @@ -0,0 +1,238 @@ +"""Dataclasses for the LaunchDarkly AI optimization package.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import ( + Any, + Awaitable, + Callable, + Dict, + List, + Literal, + Optional, + Sequence, + Union, +) + +from ldclient import Context + + +@dataclass +class JudgeResult: + """Result from a judge evaluation.""" + + score: float + rationale: Optional[str] = None + + def to_json(self) -> Dict[str, Any]: + """ + Convert the judge result to a JSON-serializable dictionary. + + :return: Dictionary representation of the judge result that can be serialized with json.dumps() + """ + return { + "score": self.score, + "rationale": self.rationale, + } + + +@dataclass +class StructuredOutputTool: + """ + Generic tool definition for enforcing structured output from LLM responses. + + This tool can be used with any LLM provider to ensure responses conform to + a specific JSON schema. The tool takes the LLM's response and returns + parsed and validated data according to the input_schema. + """ + + name: str + description: str + input_schema: Dict[str, Any] # JSON schema defining the expected output structure + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the tool definition to a dictionary format compatible with LLM APIs. 
+ + :return: Dictionary representation of the tool + """ + return { + "name": self.name, + "description": self.description, + "input_schema": self.input_schema, + } + + +@dataclass +class Message: + """A message in a conversation.""" + + role: Literal["system", "user", "assistant"] + content: str + + def to_dict(self) -> Dict[str, str]: + """Convert message to dictionary format.""" + return { + "role": self.role, + "content": self.content, + } + + +@dataclass +class OptimizationJudge: + threshold: float + judge_key: Optional[str] = None + acceptance_statement: Optional[str] = None + + +@dataclass +class AutoCommitConfig: + """Configuration for auto-committing optimization results to LaunchDarkly.""" + + enabled: bool = False + project_key: Optional[str] = None + + +@dataclass +class OptimizationContext: + """Context for a single optimization iteration.""" + + scores: Dict[str, JudgeResult] # the scores and rationales from the judges, if configured + completion_response: str + current_instructions: str + current_parameters: Dict[str, Any] + current_model: Optional[str] = None # the current model being used + user_input: Optional[str] = None # the user input message for this iteration + history: Sequence[OptimizationContext] = field( + default_factory=list + ) # previous context items + iteration: int = 0 # current iteration number + structured_output_tool: Optional[StructuredOutputTool] = ( + None # tool definition for structured output + ) + + def copy_without_history(self) -> OptimizationContext: + """ + Create a copy of this context without the history field (for flattening). 
+ + :return: A new OptimizeContext with the same data but empty history + """ + return OptimizationContext( + scores=self.scores, + completion_response=self.completion_response, + current_instructions=self.current_instructions, + current_parameters=self.current_parameters, + current_model=self.current_model, + user_input=self.user_input, + history=(), # Empty history to keep it flat + iteration=self.iteration, + structured_output_tool=self.structured_output_tool, + ) + + def to_json(self) -> Dict[str, Any]: + """ + Convert the optimization context to a JSON-serializable dictionary. + + :return: Dictionary representation of the context that can be serialized with json.dumps() + """ + scores_dict = {} + for judge_key, judge_result in self.scores.items(): + scores_dict[judge_key] = judge_result.to_json() + + structured_output_tool_dict = None + if self.structured_output_tool: + structured_output_tool_dict = self.structured_output_tool.to_dict() + + history_list = [ctx.to_json() for ctx in self.history] + + return { + "scores": scores_dict, + "completion_response": self.completion_response, + "current_instructions": self.current_instructions, + "current_parameters": self.current_parameters, + "current_model": self.current_model, + "user_input": self.user_input, + "history": history_list, + "iteration": self.iteration, + "structured_output_tool": structured_output_tool_dict, + } + + +@dataclass +class OptimizationJudgeContext: + """Context for judge evaluation.""" + + messages: List[Message] + parameters: Dict[str, Any] + tools: Optional[List[Dict[str, Any]]] = None + + +@dataclass +class OptimizationOptions: + """Options for agent optimization.""" + + # Required + context_choices: List[Context] # choices of contexts to be used, 1 min required + # Configuration + max_attempts: int + model_choices: List[str] # model ids the LLM can choose from, 1 min required + judge_model: str # which model to use as judge; this should remain consistent + variable_choices: List[ + 
Dict[str, Any] + ] # choices of interpolated variables to be chosen at random per turn, 1 min required + # Actual agent/completion (judge) calls - Required + handle_agent_call: Union[ + Callable[[str, OptimizationContext], str], + Callable[[str, OptimizationContext], Awaitable[str]], + ] + handle_judge_call: Union[ + Callable[[str, OptimizationContext], str], + Callable[[str, OptimizationJudgeContext], Awaitable[str]], + ] + # Criteria for pass/fail - Optional + user_input_options: Optional[List[str]] = ( + None # optional list of user input messages to randomly select from + ) + judges: Optional[Dict[str, OptimizationJudge]] = ( + None # auto-judges for this model that the LLM will use + ) + on_turn: Optional[Callable[[OptimizationContext], bool]] = ( + None # if you want manual control of pass/fail + ) + # Results - Optional + auto_commit: Optional[AutoCommitConfig] = ( + None # configuration for automatically saving results back to LaunchDarkly + ) + on_passing_result: Optional[Callable[[OptimizationContext], None]] = None + on_failing_result: Optional[Callable[[OptimizationContext], None]] = None + on_status_update: Optional[ + Callable[ + [ + Literal[ + "init", + "generating", + "evaluating", + "generating variation", + "turn completed", + "success", + "failure", + ], + OptimizationContext, + ], + None, + ] + ] = None # called to provide status updates during the optimization flow + + def __post_init__(self): + """Validate required options.""" + if len(self.context_choices) < 1: + raise ValueError("context_choices must have at least 1 context") + if len(self.model_choices) < 1: + raise ValueError("model_choices must have at least 1 model") + if len(self.variable_choices) < 1: + raise ValueError("variable_choices must have at least 1 variable choice") + if self.judges is None and self.on_turn is None: + raise ValueError("Either judges or on_turn must be provided") + if self.judge_model is None: + raise ValueError("judge_model must be provided") diff --git 
a/packages/optimization/src/ldai_optimization/util.py b/packages/optimization/src/ldai_optimization/util.py new file mode 100644 index 0000000..1f48405 --- /dev/null +++ b/packages/optimization/src/ldai_optimization/util.py @@ -0,0 +1,201 @@ +"""Utility functions for the LaunchDarkly AI optimization package.""" + +import inspect +import json +import logging +import re +from typing import Any, Awaitable, Dict, List, Optional, Union + +from ldai_optimization.dataclasses import StructuredOutputTool + +logger = logging.getLogger(__name__) + + +async def await_if_needed( + result: Union[str, Awaitable[str]] +) -> str: + """ + Handle both sync and async callable results. + + :param result: Either a string or an awaitable that returns a string + :return: The string result + """ + if inspect.iscoroutine(result): + return await result + else: + return result + + +def create_evaluation_tool() -> StructuredOutputTool: + """ + Create the structured output tool for judge evaluations. + + :return: A StructuredOutputTool for evaluation responses + """ + return StructuredOutputTool( + name="return_evaluation", + description="Returns an evaluation with a score and rationale.", + input_schema={ + "type": "object", + "properties": { + "score": { + "type": "number", + "description": "The evaluation score (typically 0.0 to 1.0)", + }, + "rationale": { + "type": "string", + "description": "Explanation of the evaluation", + }, + }, + "required": ["score", "rationale"], + }, + ) + + +def create_boolean_tool() -> StructuredOutputTool: + """ + Create the structured output tool for acceptance judges. 
+ + :return: A StructuredOutputTool for boolean evaluation responses + """ + return StructuredOutputTool( + name="return_boolean", + description="Returns a boolean value and reasoning for the evaluation.", + input_schema={ + "type": "object", + "properties": { + "passed": { + "type": "boolean", + "description": "Whether the response passes the evaluation criteria", + }, + "rationale": { + "type": "string", + "description": "Explanation of the evaluation decision", + }, + }, + "required": ["passed", "rationale"], + }, + ) + + +def create_variation_tool(model_choices: List[str]) -> StructuredOutputTool: + """ + Create the structured output tool for variation generation. + + :param model_choices: List of model IDs the LLM may select from + :return: A StructuredOutputTool for variation generation responses + """ + return StructuredOutputTool( + name="return_improved_configuration", + description=( + "Returns the improved agent configuration with updated instructions and parameters. " + "This tool enforces structured output to ensure the response can be parsed and validated." + ), + input_schema={ + "type": "object", + "properties": { + "current_instructions": { + "type": "string", + "description": "The improved agent instructions based on the evaluation feedback", + }, + "current_parameters": { + "type": "object", + "description": "The improved agent parameters (e.g., temperature, max_tokens, etc.)", + "additionalProperties": True, + }, + "model": { + "type": "string", + "description": "The model to use for the improved agent", + "enum": model_choices, + }, + }, + "required": ["current_instructions", "current_parameters", "model"], + "additionalProperties": False, + }, + ) + + +def extract_json_from_response(response_str: str) -> Dict[str, Any]: + """ + Parse a JSON object from an LLM response string. + + Attempts direct JSON parsing first, then progressively falls back to + extracting JSON from markdown code blocks and balanced-brace scanning. 
+ + :param response_str: Raw string response from an LLM + :return: Parsed dictionary + :raises ValueError: If no valid JSON object can be extracted + """ + # Try direct parse first + try: + return json.loads(response_str) + except json.JSONDecodeError: + pass + + response_data: Optional[Dict[str, Any]] = None + + # Try to extract JSON from markdown code blocks + code_block_match = re.search( + r'```(?:json)?\s*(\{.*?\})\s*```', + response_str, + re.DOTALL, + ) + if code_block_match: + try: + response_data = json.loads(code_block_match.group(1)) + except json.JSONDecodeError: + pass + + # Try balanced-brace scanning + if response_data is None: + brace_count = 0 + start_idx = response_str.find('{') + if start_idx != -1: + for i in range(start_idx, len(response_str)): + if response_str[i] == '{': + brace_count += 1 + elif response_str[i] == '}': + brace_count -= 1 + if brace_count == 0: + json_str = response_str[start_idx:i + 1] + try: + response_data = json.loads(json_str) + break + except json.JSONDecodeError: + start_idx = response_str.find('{', start_idx + 1) + if start_idx == -1: + break + brace_count = 0 + + # Legacy regex fallback + if response_data is None: + json_match = re.search( + r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*"current_instructions"[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', + response_str, + re.DOTALL, + ) + if json_match: + try: + response_data = json.loads(json_match.group()) + except json.JSONDecodeError: + logger.error( + "Extracted JSON string failed to parse: %s", + json_match.group()[:200], + ) + raise ValueError( + "Failed to parse extracted JSON from variation generation response" + ) + + if response_data is None: + logger.error( + "Failed to extract JSON from response. " + "Response length: %d, first 200 chars: %s", + len(response_str), + response_str[:200], + ) + raise ValueError( + "Failed to parse structured output from variation generation. " + "Expected JSON object with 'current_instructions', 'current_parameters', and 'model' fields." 
+ ) + + return response_data diff --git a/packages/optimization/tests/test_package.py b/packages/optimization/tests/test_package.py index 2123eb6..a8356a8 100644 --- a/packages/optimization/tests/test_package.py +++ b/packages/optimization/tests/test_package.py @@ -2,7 +2,7 @@ import pytest -from ldai_optimization import ApiAgentOptimizationClient, __version__ +from ldai_optimization import OptimizationClient, __version__ def test_version_is_string(): @@ -11,6 +11,6 @@ def test_version_is_string(): def test_optimize_not_implemented(): - client = ApiAgentOptimizationClient() + client = OptimizationClient() with pytest.raises(NotImplementedError): client.optimize("example", {}) From 1712e4f687a0d1925a16758fe969b252ed70399c Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 30 Mar 2026 16:46:30 -0800 Subject: [PATCH 02/11] feat: implementation of agent optimization + tests --- .../src/ldai_optimization/client.py | 933 ++++++++++++++---- .../src/ldai_optimization/dataclasses.py | 48 +- .../src/ldai_optimization/util.py | 67 +- packages/optimization/tests/test_client.py | 901 +++++++++++++++++ 4 files changed, 1746 insertions(+), 203 deletions(-) create mode 100644 packages/optimization/tests/test_client.py diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index b4d4872..5674e99 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -7,22 +7,27 @@ import random import json -from ldai import LDAIClient, AIJudgeConfigDefault, AIAgentConfig +from ldai import LDAIClient, AIJudgeConfig, AIJudgeConfigDefault, AIAgentConfig +from ldai.models import ModelConfig +from ldclient import Context from ldai_optimization.dataclasses import ( AutoCommitConfig, JudgeResult, - Message, OptimizationContext, OptimizationJudge, OptimizationJudgeContext, OptimizationOptions, + StructuredOutputTool, ) from ldai_optimization.util import ( 
await_if_needed, create_evaluation_tool, create_variation_tool, extract_json_from_response, + handle_evaluation_tool_call, + handle_variation_tool_call, + interpolate_variables, ) logger = logging.getLogger(__name__) @@ -31,9 +36,11 @@ class OptimizationClient: _options: OptimizationOptions _ldClient: LDAIClient + _agent_config: AIAgentConfig _has_api_key: bool _api_key: Optional[str] _agent_key: str + _initial_instructions: str def __init__(self, ldClient: LDAIClient) -> None: self._ldClient = ldClient @@ -58,9 +65,40 @@ def _initialize_class_members_from_config( ) self._history: List[OptimizationContext] = [] + def _build_agent_config_for_context( + self, ctx: OptimizationContext + ) -> AIAgentConfig: + """ + Construct an AIAgentConfig that reflects the current optimization iteration. + + Uses the instructions, model, and parameters from the given context so the + caller receives the variation being evaluated rather than the original base config. + ``{{placeholder}}`` tokens in the instructions are substituted using + ctx.current_variables at call time so the stored template is never mutated. + + :param ctx: The OptimizationContext for this iteration + :return: A fresh AIAgentConfig populated from the context's current state + """ + instructions = ( + interpolate_variables(ctx.current_instructions, ctx.current_variables) + if ctx.current_variables + else ctx.current_instructions + ) + return AIAgentConfig( + key=self._agent_key, + enabled=True, + model=ModelConfig( + name=ctx.current_model or "", + parameters=ctx.current_parameters, + ), + instructions=instructions, + provider=self._agent_config.provider, + ) + def _create_optimization_context( self, iteration: int, + variables: Dict[str, Any], user_input: Optional[str] = None, completion_response: str = "", scores: Optional[Dict[str, JudgeResult]] = None, @@ -69,6 +107,7 @@ def _create_optimization_context( Create an OptimizeContext with current state. 
:param iteration: Current iteration number + :param variables: Variable set chosen for this iteration :param user_input: Optional user input for this iteration :param completion_response: Completion response string :param scores: Optional dictionary of judge results @@ -80,6 +119,7 @@ def _create_optimization_context( completion_response=completion_response, current_instructions=self._current_instructions, current_parameters=self._current_parameters.copy(), + current_variables=variables, current_model=self._current_model, user_input=user_input, history=tuple(flat_history), @@ -101,11 +141,175 @@ def _safe_status_update( self._options.on_status_update(status, context.copy_without_history()) except Exception as e: logger.exception( - "[Turn %d] -> on_status_update callback failed", iteration + "[Iteration %d] -> on_status_update callback failed", iteration ) + def _judge_config( + self, + judge_key: str, + context: Context, + default: AIJudgeConfigDefault, + variables: Dict[str, Any], + ) -> AIJudgeConfig: + """ + Fetch a judge configuration from the LaunchDarkly client. + + Thin wrapper around LDAIClient.judge_config so callers do not need a + direct reference to the client. + + :param judge_key: The key for the judge configuration in LaunchDarkly + :param context: The evaluation context + :param default: Fallback config when the flag is disabled or unreachable + :param variables: Template variables for instruction interpolation + :return: The resolved AIJudgeConfig + """ + return self._ldClient.judge_config(judge_key, context, default, variables) + + def _build_message_history_text( + self, + input_text: str, + reasoning_history: str, + current_user_input: str, + ) -> str: + """ + Build a formatted message-history string for use as a judge template variable. + + Combines the current instructions (system text), the conversation turns + recorded in self._history, the current turn's user question, and the + accumulated reasoning/score history. 
+ + :param input_text: Current system instructions (may be empty string) + :param reasoning_history: Pre-formatted string from _build_reasoning_history + :param current_user_input: The user question for the turn being evaluated. + Must be passed explicitly because the current turn is not yet in + self._history when the judge runs. + :return: Combined string to substitute into the judge's message_history variable + """ + turn_messages = [] + for ctx in self._history: + if ctx.user_input: + turn_messages.append(f"User: {ctx.user_input}") + if ctx.completion_response: + turn_messages.append(f"Assistant: {ctx.completion_response}") + + # Include the current turn's question so judges see what was actually asked + turn_messages.append(f"User: {current_user_input}") + + sections = [input_text, "\n".join(turn_messages), reasoning_history] + return "\n\n".join(s for s in sections if s) + + def _serialize_scores( + self, judge_results: Dict[str, JudgeResult] + ) -> Dict[str, Any]: + """ + Convert judge results to a JSON-serializable dictionary. + + :param judge_results: Dictionary of judge keys to JudgeResult instances + :return: Dictionary suitable for json.dumps + """ + return {key: result.to_json() for key, result in judge_results.items()} + + def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[StructuredOutputTool]: + """ + Extract and normalise the tools list from agent parameters. + + Reads the ``tools`` key from *parameters* (if present) and converts + every entry to a StructuredOutputTool so judges receive typed objects. 
+ + :param parameters: The agent's current_parameters dict + :return: List of StructuredOutputTool instances, empty list if no tools are configured + """ + raw_tools = parameters.get("tools", []) + if not raw_tools: + return [] + if not isinstance(raw_tools, list): + raw_tools = [raw_tools] + + result = [] + for tool in raw_tools: + if isinstance(tool, StructuredOutputTool): + result.append(tool) + elif hasattr(tool, "to_dict"): + result.append(StructuredOutputTool.from_dict(tool.to_dict())) + elif isinstance(tool, dict): + result.append(StructuredOutputTool.from_dict(tool)) + return result + + def _parse_judge_response( + self, + response_str: str, + judge_key: str, + judge_identifier: str, + iteration: int, + clamp_score: bool = True, + ) -> JudgeResult: + """ + Parse a structured LLM judge response into a JudgeResult. + + Expects a JSON object with "score" (float) and optionally "rationale" + (str). On any parsing failure, logs the exception and returns a zero score. + + :param response_str: Raw string response from the judge LLM + :param judge_key: Key used to identify this judge in results dicts + :param judge_identifier: Human-readable identifier for log messages + :param iteration: Current iteration number for logging + :param clamp_score: When True, clamps score to [0.0, 1.0] + :return: Parsed JudgeResult, or a zero-score result on failure + """ + try: + response_data = extract_json_from_response(response_str) + score = float(response_data.get("score", 0.0)) + if clamp_score: + score = max(0.0, min(1.0, score)) + rationale = response_data.get("rationale") + return JudgeResult(score=score, rationale=rationale) + except Exception: + logger.exception( + "[Iteration %d] -> Failed to parse judge response for %s", + iteration, + judge_identifier, + ) + return JudgeResult(score=0.0, rationale=None) + + def _builtin_judge_tool_handlers(self) -> Dict[str, Any]: + """ + Build the dict of built-in tool name → handler passed to handle_judge_call. 
+ + Each handler accepts the tool-call arguments dict produced by the LLM and + returns a JSON string so the caller can forward it back to the model or use + it directly as the judge response. + + :return: Mapping of built-in tool names to their handler callables + """ + return { + create_evaluation_tool().name: handle_evaluation_tool_call, + } + + def _builtin_agent_tool_handlers(self, is_variation: bool) -> Dict[str, Any]: + """ + Build the dict of built-in tool name → handler passed to handle_agent_call. + + For regular agent turns this is empty — the config only contains user-defined + tools from the LD flag. For variation-generation turns the variation structured + output tool is included so the caller can distinguish it from user tools and + route the LLM tool call back to the framework. + + :param is_variation: True when called for a variation-generation turn + :return: Mapping of built-in tool names to their handler callables + """ + if is_variation: + return { + create_variation_tool(self._options.model_choices).name: handle_variation_tool_call, + } + return {} + async def _call_judges( - self, completion_response: str, iteration: int + self, + completion_response: str, + iteration: int, + user_input: str, + variables: Optional[Dict[str, Any]] = None, + agent_tools: Optional[List[StructuredOutputTool]] = None, ) -> Dict[str, JudgeResult]: """ Call all judges in parallel (auto-path). 
@@ -115,16 +319,30 @@ async def _call_judges( :param completion_response: The agent's completion response to evaluate :param iteration: Current iteration number + :param user_input: The user's question for this turn, forwarded to judges so + they know what was actually asked (the current turn is not yet in + self._history when judges run) + :param variables: The variable set that was used during the agent generation + :param agent_tools: Normalised list of tool dicts that were available to the agent :return: Dictionary of judge results (score and rationale) """ if not self._options.judges: return {} - logger.info("[Turn %d] -> Executing evaluation...", iteration) + resolved_variables: Dict[str, Any] = variables or {} + resolved_agent_tools: List[Dict[str, Any]] = agent_tools or [] + + logger.info("[Iteration %d] -> Executing evaluation...", iteration) reasoning_history = self._build_reasoning_history() judge_results: Dict[str, JudgeResult] = {} - for judge_key, optimization_judge in self._options.judges.items(): + judge_count = len(self._options.judges) + for idx, (judge_key, optimization_judge) in enumerate(self._options.judges.items(), 1): + judge_type = "config" if optimization_judge.judge_key is not None else "acceptance" + logger.info( + "[Iteration %d] -> Running judge %d/%d '%s' (%s)...", + iteration, idx, judge_count, judge_key, judge_type, + ) try: if optimization_judge.judge_key is not None: result = await self._evaluate_config_judge( @@ -133,6 +351,9 @@ async def _call_judges( completion_response, iteration, reasoning_history, + user_input=user_input, + variables=resolved_variables, + agent_tools=resolved_agent_tools, ) judge_results[judge_key] = result else: @@ -142,17 +363,32 @@ async def _call_judges( completion_response, iteration, reasoning_history, + user_input=user_input, + variables=resolved_variables, + agent_tools=resolved_agent_tools, ) judge_results[judge_key] = result + + threshold = optimization_judge.threshold if 
optimization_judge.threshold is not None else 1.0 + passed = result.score >= threshold + logger.info( + "[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s", + iteration, + judge_key, + result.score, + threshold, + "PASSED" if passed else "FAILED", + f" | {result.rationale}" if result.rationale else "", + ) except Exception as e: logger.exception( - "[Turn %d] -> Judge %s evaluation failed", iteration, judge_key + "[Iteration %d] -> Judge %s evaluation failed", iteration, judge_key ) judge_results[judge_key] = JudgeResult(score=0.0, rationale=None) judge_results_json = self._serialize_scores(judge_results) logger.info( - "[Turn %d] -> Evaluation result: %s", + "[Iteration %d] -> Evaluation result: %s", iteration, json.dumps(judge_results_json, indent=2), ) @@ -165,6 +401,9 @@ async def _evaluate_config_judge( completion_response: str, iteration: int, reasoning_history: str, + user_input: str, + variables: Optional[Dict[str, Any]] = None, + agent_tools: Optional[List[StructuredOutputTool]] = None, ) -> JudgeResult: """ Evaluate using a config-type judge (with judge_key). 
@@ -174,28 +413,36 @@ async def _evaluate_config_judge( :param completion_response: The agent's completion response to evaluate :param iteration: Current iteration number :param reasoning_history: Formatted string of reasoning from previous iterations + :param user_input: The user's question for this turn + :param variables: The variable set that was used during agent generation + :param agent_tools: Normalised list of tool dicts that were available to the agent :return: The judge result with score and rationale """ # Config-type judge: fetch judge config on-demand from LaunchDarkly SDK input_text = self._current_instructions or "" - # Combine current instructions with reasoning history for message_history + # Combine current instructions, history, and current question for message_history message_history_text = self._build_message_history_text( - input_text, reasoning_history + input_text, reasoning_history, current_user_input=user_input ) + # Merge agent variables so the judge's LD-managed instructions can reference + # {{variable_name}} tokens alongside the standard judge template variables. 
+ template_variables: Dict[str, Any] = { + **(variables or {}), + "message_history": message_history_text, + "response_to_evaluate": completion_response, + } + judge_config = self._judge_config( optimization_judge.judge_key, self._options.context_choices[0], AIJudgeConfigDefault(enabled=False), - { - "message_history": message_history_text, - "response_to_evaluate": completion_response, - }, + template_variables, ) if not judge_config.enabled: logger.warning( - "[Turn %d] -> Judge %s is disabled", + "[Iteration %d] -> Judge %s is disabled", iteration, optimization_judge.judge_key, ) @@ -203,62 +450,75 @@ async def _evaluate_config_judge( if not judge_config.messages: logger.warning( - "[Turn %d] -> Judge %s has no messages", + "[Iteration %d] -> Judge %s has no messages", iteration, optimization_judge.judge_key, ) return JudgeResult(score=0.0, rationale=None) - # Convert LDMessage to Message objects, appending structured output instruction to system messages - judge_messages = [] + # Collapse all system messages into a single instructions string; collect the user message + system_parts = [] + user_parts = [] for msg in judge_config.messages: - content = msg.content if msg.role == "system": - content += " Use the structured output tool to format your response. You should always return a JSON object with a score and rationale." 
- judge_messages.append(Message(role=msg.role, content=content)) - - # Build parameters from judge config, hoisting any pre-existing tools so we can append ours - parameters = {} - tools = [] - if judge_config.model: - parameters["model"] = judge_config.model.name - if judge_config.model._parameters: - # Extract tools if present - existing_tools = judge_config.model._parameters.get("tools") - if existing_tools: - tools = ( - existing_tools - if isinstance(existing_tools, list) - else [existing_tools] - ) - # Convert to dicts if needed - tools = [ - tool.to_dict() if hasattr(tool, "to_dict") else tool - for tool in tools - ] - # Copy parameters excluding tools - parameters.update( - { - k: v - for k, v in judge_config.model._parameters.items() - if k != "tools" - } + system_parts.append( + msg.content + + " Use the structured output tool to format your response." + " You should always return a JSON object with a score and rationale." ) + elif msg.role == "user": + user_parts.append(msg.content) + + instructions = "\n\n".join(system_parts) + judge_user_input = "\n\n".join(user_parts) if user_parts else f"Here is the response to evaluate: {completion_response}" + + # Collect model parameters from the judge config, separating out any existing tools + model_name = judge_config.model.name if judge_config.model else self._options.judge_model + model_params: Dict[str, Any] = {} + tools: List[StructuredOutputTool] = [] + if judge_config.model and judge_config.model._parameters: + existing_tools = judge_config.model._parameters.get("tools") + if existing_tools: + raw = existing_tools if isinstance(existing_tools, list) else [existing_tools] + for t in raw: + if isinstance(t, StructuredOutputTool): + tools.append(t) + elif hasattr(t, "to_dict"): + tools.append(StructuredOutputTool.from_dict(t.to_dict())) + elif isinstance(t, dict): + tools.append(StructuredOutputTool.from_dict(t)) + model_params = {k: v for k, v in judge_config.model._parameters.items() if k != "tools"} + + # 
Prepend agent tools so the judge can call them when verifying the response + if agent_tools: + tools = list(agent_tools) + tools # Add structured output tool for score and rationale - tools.append(create_evaluation_tool().to_dict()) + tools.append(create_evaluation_tool()) + + judge_agent_config = AIAgentConfig( + key=judge_key, + enabled=True, + model=ModelConfig( + name=model_name, + parameters={**model_params, "tools": [t.to_dict() for t in tools]}, + ), + instructions=instructions, + provider=self._agent_config.provider, + ) judge_ctx = OptimizationJudgeContext( - messages=judge_messages, - parameters=parameters, - tools=tools, + user_input=judge_user_input, + variables=variables or {}, ) - result = self._options.handle_judge_call(self._options.judge_model, judge_ctx) + result = self._options.handle_judge_call( + judge_key, judge_agent_config, judge_ctx, self._builtin_judge_tool_handlers() + ) judge_response_str = await await_if_needed(result) logger.info( - "[Turn %d] -> Judge response (%s): %s", + "[Iteration %d] -> Judge response (%s): %s", iteration, judge_key, judge_response_str, @@ -281,6 +541,9 @@ async def _evaluate_acceptance_judge( completion_response: str, iteration: int, reasoning_history: str, + user_input: str, + variables: Optional[Dict[str, Any]] = None, + agent_tools: Optional[List[StructuredOutputTool]] = None, ) -> JudgeResult: """ Evaluate using an acceptance statement judge. 
@@ -290,61 +553,77 @@ async def _evaluate_acceptance_judge( :param completion_response: The agent's completion response to evaluate :param iteration: Current iteration number :param reasoning_history: Formatted string of reasoning from previous iterations + :param user_input: The user's question for this turn + :param variables: The variable set that was used during agent generation + :param agent_tools: Normalised list of tool dicts that were available to the agent :return: The judge result with score and rationale """ if not optimization_judge.acceptance_statement: logger.error( - "[Turn %d] -> Judge %s has no acceptance_statement", + "[Iteration %d] -> Judge %s has no acceptance_statement", iteration, judge_key, ) return JudgeResult(score=0.0, rationale=None) - # Build message history with reasoning for context - message_history_text = self._build_message_history_text("", reasoning_history) - - # Build judge context for LLM call - judge_messages = [ - Message( - role="system", - content=f"""You are a judge that evaluates the response to the user's question. - - Here is the statement that you should evaluate the response against: '{optimization_judge.acceptance_statement}' - Here is the history of all messages between the user and the assistant: {message_history_text} - You should score the response based on how well it meets the acceptance statement using a score between 0.0 and 1.0. - A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly. - A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well. - A score of 0.0-0.3 means that it does not match well at all. You can return any value between 0.0 and 1.0. - You should also provide a rationale for your score. - You should call the structured output tool to format your response. - - Here is an example of a good response: - {{ - "score": 0.8, - "rationale": "The response matches the acceptance statement well. 
It provides a detailed explanation of the concept and its applications." - }} - """, - ), - Message( - role="user", - content=f"Here is the response to evaluate: {completion_response}", - ), - ] + resolved_variables = variables or {} + resolved_agent_tools = agent_tools or [] + + # Build message history including the current user question + message_history_text = self._build_message_history_text( + "", reasoning_history, current_user_input=user_input + ) + + # Build instructions for the judge + instructions = ( + f"You are a judge that evaluates the response to the user's question.\n\n" + f"Here is the statement that you should evaluate the response against: '{optimization_judge.acceptance_statement}'\n" + f"Here is the history of all messages between the user and the assistant: {message_history_text}\n" + f"You should score the response based on how well it meets the acceptance statement using a score between 0.0 and 1.0.\n" + f"A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly.\n" + f"A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well.\n" + f"A score of 0.0-0.3 means that it does not match well at all. You can return any value between 0.0 and 1.0.\n" + f"You should also provide a rationale for your score.\n" + f"You should call the structured output tool to format your response.\n\n" + f'Example: {{"score": 0.8, "rationale": "The response matches the acceptance statement well."}}' + ) + + if resolved_variables: + instructions += f"\n\nThe following variables were available to the agent: {json.dumps(resolved_variables)}" + + if resolved_agent_tools: + tool_names = [t.name for t in resolved_agent_tools] + instructions += ( + f"\n\nThe following tools were available to the agent and may be called by you to verify the response: {json.dumps(tool_names)}." + "\nIf verifying the response requires looking up external information, call the appropriate tool before scoring." 
+ "You should only call the tools for the most recent response, and should only call the tools if necessary. Assume that previous feedback will have addressed bad tool call results from prior iterations." + ) + + # Prepend agent tools so the judge can invoke them for verification if needed + tools: List[StructuredOutputTool] = list(resolved_agent_tools) + [create_evaluation_tool()] - # Create structured output tool for evaluation response with score and rationale - evaluation_tool = create_evaluation_tool() + judge_agent_config = AIAgentConfig( + key=judge_key, + enabled=True, + model=ModelConfig( + name=self._options.judge_model, + parameters={"tools": [t.to_dict() for t in tools]}, + ), + instructions=instructions, + ) judge_ctx = OptimizationJudgeContext( - messages=judge_messages, - parameters={"model": self._options.judge_model}, - tools=[evaluation_tool.to_dict()], + user_input=f"Here is the response to evaluate: {completion_response}", + variables=resolved_variables, ) - result = self._options.handle_judge_call(self._options.judge_model, judge_ctx) + result = self._options.handle_judge_call( + judge_key, judge_agent_config, judge_ctx, self._builtin_judge_tool_handlers() + ) judge_response = await await_if_needed(result) logger.info( - "[Turn %d] -> Judge response (%s): %s", iteration, judge_key, judge_response + "[Iteration %d] -> Judge response (%s): %s", iteration, judge_key, judge_response ) # Parse judge response — expect structured JSON output with score and rationale @@ -352,18 +631,32 @@ async def _evaluate_acceptance_judge( judge_response, judge_key, judge_key, iteration, clamp_score=True ) - async def _get_agent_config(self, agent_key: str) -> AIAgentConfig: + async def _get_agent_config(self, agent_key: str, context: Context) -> AIAgentConfig: """ - Get the agent configuration from the LaunchDarkly client. 
+ Fetch the agent configuration, replacing the instructions with the raw variation + template so that {{placeholder}} tokens are preserved for client-side interpolation. + + agent_config() is called normally so we get a fully populated AIAgentConfig + (including the tracker). We then call variation() separately to retrieve the + unrendered instruction template and swap it in, keeping everything else intact. :param agent_key: The key for the agent to get the configuration for - :return: The agent configuration + :param context: The evaluation context + :return: AIAgentConfig with raw {{placeholder}} instruction templates intact """ try: - agent_config = await self._ldClient.agent_config(agent_key) - self._initialize_class_members(agent_config) + agent_config = self._ldClient.agent_config(agent_key, context) + + # variation() returns the raw JSON before chevron.render(), so instructions + # still contain {{placeholder}} tokens rather than empty strings. + raw_variation = self._ldClient._client.variation(agent_key, context, {}) + raw_instructions = raw_variation.get("instructions", agent_config.instructions) + self._initial_instructions = raw_instructions + + agent_config = dataclasses.replace(agent_config, instructions=raw_instructions) + self._initialize_class_members_from_config(agent_config) return agent_config - except Exception as e: + except Exception: logger.exception("[Optimization] -> Failed to get agent configuration") raise @@ -377,30 +670,55 @@ async def optimize_from_options( :return: Optimization result. """ self._agent_key = agent_key - agent_config = await self._get_agent_config(agent_key) + context = random.choice(options.context_choices) + agent_config = await self._get_agent_config(agent_key, context) return await self._run_optimization(agent_config, options) + def _build_reasoning_history(self) -> str: + """ + Build a formatted string of reasoning from previous iterations. 
+ + :return: Formatted string containing reasoning history + """ + if not self._history: + return "" + + reasoning_parts = [] + for i, prev_ctx in enumerate(self._history, 1): + if prev_ctx.scores: + reasoning_parts.append(f"## Iteration {i} Judge Evaluations:") + for judge_key, result in prev_ctx.scores.items(): + reasoning_parts.append(f"- {judge_key}: Score {result.score}") + if result.rationale: + reasoning_parts.append(f" Reasoning: {result.rationale}") + reasoning_parts.append("") + + return "\n".join(reasoning_parts) + def _build_new_variation_prompt( - self, previous_ctx: Optional[OptimizationContext] + self, history: List[OptimizationContext] ) -> str: """ Build the LLM prompt for generating an improved agent configuration. - Constructs a detailed instruction string based on the previous iteration's - configuration, completion result, and judge scores. When no previous context - exists (first variation attempt), asks the LLM to improve the current config - without evaluation feedback. + Constructs a detailed instruction string based on the full optimization + history, including all previous configurations, completion results, and + judge scores. When history is empty (first variation attempt), asks the + LLM to improve the current config without evaluation feedback. - :param previous_ctx: The most recent OptimizationContext, or None on the first attempt + :param history: All previous OptimizationContexts, oldest first. Empty on the first attempt. 
:return: The assembled prompt string
         """
         sections = [
             self._new_variation_prompt_preamble(),
-            self._new_variation_prompt_configuration(previous_ctx),
-            self._new_variation_prompt_feedback(previous_ctx),
-            self._new_variation_prompt_improvement_instructions(previous_ctx),
+            self._new_variation_prompt_acceptance_criteria(),
+            self._new_variation_prompt_configuration(history),
+            self._new_variation_prompt_feedback(history),
+            self._new_variation_prompt_improvement_instructions(history),
         ]
-        return "\n\n".join(s for s in sections if s)
+
+        built_prompt = "\n\n".join(s for s in sections if s)
+        return built_prompt
 
     def _new_variation_prompt_preamble(self) -> str:
         """Static opening section for the variation generation prompt."""
@@ -408,27 +726,75 @@ def _new_variation_prompt_preamble(self) -> str:
             "You are an assistant that helps improve agent configurations through iterative optimization.",
             "",
             "Your task is to generate improved agent instructions and parameters based on the feedback provided.",
+            "The feedback you provide should guide the LLM to improve the agent instructions for all possible use cases, not one concrete case.",
+            "For example, if the feedback is that the agent is not returning the correct records, you should improve the agent instructions to return the correct records for all possible use cases, not just the one concrete case that was provided in the feedback.",
+            "When changing the instructions, keep the original intent in mind when it comes to things like the use of variables and placeholders.",
+            "If the original instructions were to use a placeholder like {{id}}, you should keep the placeholder in the new instructions, not replace it with the actual value. This is the case for all parameterized values (all parameters should appear in each new variation).",
+            "Pay particular attention to the instructions regarding tools and the rules for variables."
]) + def _new_variation_prompt_acceptance_criteria(self) -> str: + """ + Acceptance criteria section of the variation prompt. + + Collects every acceptance statement defined across all judges and renders + them as an emphatic block so the LLM understands exactly what the improved + configuration must achieve. Returns an empty string when no judges carry + acceptance statements (e.g. all judges are config-key-only judges). + """ + if not self._options.judges: + return "" + + statements = [ + (key, judge.acceptance_statement) + for key, judge in self._options.judges.items() + if judge.acceptance_statement + ] + + if not statements: + return "" + + lines = [ + "## *** ACCEPTANCE CRITERIA (MUST BE MET) ***", + "The improved configuration MUST produce responses that satisfy ALL of the following criteria.", + "These criteria are non-negotiable — every generated variation will be evaluated against them.", + "", + ] + for key, statement in statements: + lines.append(f"- [{key}] {statement}") + + lines += [ + "", + "When writing new instructions, explicitly address each criterion above.", + "Do not sacrifice any criterion in favour of another.", + ] + + return "\n".join(lines) + def _new_variation_prompt_configuration( - self, previous_ctx: Optional[OptimizationContext] + self, history: List[OptimizationContext] ) -> str: """ Configuration section of the variation prompt. - Shows the previous iteration's model, instructions, parameters, and completion - response when available, or the current instance state on the first attempt. + Shows the most recent iteration's model, instructions, parameters, + user input, and completion response when history is available, or the + current instance state on the first attempt. 
""" - if previous_ctx: - return "\n".join([ - "## Previous Configuration:", + if history: + previous_ctx = history[-1] + lines = [ + "## Most Recent Configuration:", f"Model: {previous_ctx.current_model}", f"Instructions: {previous_ctx.current_instructions}", f"Parameters: {previous_ctx.current_parameters}", "", - "## Previous Result:", - previous_ctx.completion_response, - ]) + "## Most Recent Result:", + ] + if previous_ctx.user_input: + lines.append(f"User question: {previous_ctx.user_input}") + lines.append(f"Agent response: {previous_ctx.completion_response}") + return "\n".join(lines) else: return "\n".join([ "## Current Configuration:", @@ -438,50 +804,55 @@ def _new_variation_prompt_configuration( ]) def _new_variation_prompt_feedback( - self, previous_ctx: Optional[OptimizationContext] + self, history: List[OptimizationContext] ) -> str: """ Evaluation feedback section of the variation prompt. - Returns an empty string when there are no scores so it is filtered out - of the assembled prompt entirely. + Renders all previous iterations' scores in chronological order so the + LLM can observe trends across the full optimization run. Returns an + empty string when no history exists or no iteration has scores, so it + is filtered out of the assembled prompt entirely. 
""" - if not previous_ctx or not previous_ctx.scores: + iterations_with_scores = [ctx for ctx in history if ctx.scores] + if not iterations_with_scores: return "" - lines = ["## Evaluation Feedback:"] - for judge_key, result in previous_ctx.scores.items(): - optimization_judge = ( - self._options.judges.get(judge_key) - if self._options.judges - else None - ) - if optimization_judge: - score = result.score - if optimization_judge.threshold is not None: - passed = score >= optimization_judge.threshold - status = "PASSED" if passed else "FAILED" - feedback_line = f"- {judge_key}: Score {score:.3f} (threshold: {optimization_judge.threshold}) - {status}" - if result.rationale: - feedback_line += f"\n Reasoning: {result.rationale}" - lines.append(feedback_line) - else: - passed = score >= 1.0 - status = "PASSED" if passed else "FAILED" - feedback_line = f"- {judge_key}: {status}" + lines = ["## Evaluation History:"] + for ctx in iterations_with_scores: + lines.append(f"\n### Iteration {ctx.iteration}:") + if ctx.user_input: + lines.append(f"User question: {ctx.user_input}") + for judge_key, result in ctx.scores.items(): + optimization_judge = ( + self._options.judges.get(judge_key) + if self._options.judges + else None + ) + if optimization_judge: + score = result.score + if optimization_judge.threshold is not None: + passed = score >= optimization_judge.threshold + status = "PASSED" if passed else "FAILED" + feedback_line = f"- {judge_key}: Score {score:.3f} (threshold: {optimization_judge.threshold}) - {status}" + else: + passed = score >= 1.0 + status = "PASSED" if passed else "FAILED" + feedback_line = f"- {judge_key}: {status}" if result.rationale: feedback_line += f"\n Reasoning: {result.rationale}" lines.append(feedback_line) return "\n".join(lines) def _new_variation_prompt_improvement_instructions( - self, previous_ctx: Optional[OptimizationContext] + self, history: List[OptimizationContext] ) -> str: """ Improvement instructions section of the variation 
prompt.

-        Includes model-choice guidance and the required output format schema.
-        When previous_ctx is provided, adds feedback-driven improvement directives.
+        Includes model-choice guidance, prompt variable rules, and the required
+        output format schema. When history is non-empty, adds feedback-driven
+        improvement directives.
         """
         model_instructions = "\n".join([
             "You may also choose to change the model if you believe that the current model is not performing well or a different model would be better suited for the task. "
@@ -489,6 +860,69 @@ def _new_variation_prompt_improvement_instructions(
             "When suggesting a new model, you should provide a rationale for why you believe the new model would be better suited for the task.",
         ])
 
+        # Collect unique variable keys across all variable_choices entries
+        variable_keys: set = set()
+        for choice in self._options.variable_choices:
+            variable_keys.update(choice.keys())
+        placeholder_list = ", ".join(f"{{{{{k}}}}}" for k in sorted(variable_keys))
+
+        variable_instructions = "\n".join([
+            "## Prompt Variables:",
+            "These variables are substituted into the instructions at call time using {{variable_name}} syntax.",
+            "Rules:",
+            "- If the {{variable_name}} placeholder is not present in the current instructions, you should include it where logically appropriate.",
+            "Here are the original instructions so that you can see how the placeholders are used and which are available:",
+            "\nSTART:" + "\n" + self._initial_instructions + "\n",
+            "\nEND OF ORIGINAL INSTRUCTIONS\n",
+            f"The following prompt variables are available and are the only variables that should be used: {placeholder_list}",
+            "Here is an example of a good response if an {{id}} placeholder is available: 'Select records matching id {{id}}'",
+            "Here is an example of a bad response if an {{id}} placeholder is available: 'Select records matching id 1232'",
+            "Here is an example of a good response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select 
records matching id {{resource_id}} and type {{resource_type}}'", + "Here is an example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id 1232 and type {{resource_type}}'", + "Here is another example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type resource-123'", + ]) + + tool_instructions = "\n".join([ + "## Tool Format:", + "If the current configuration includes tools, you MUST return them unchanged in current_parameters[\"tools\"].", + "Do NOT include internal framework tools such as the evaluation tool or structured output tool.", + "Each tool must follow this exact format:", + "{", + ' "name": "tool-name",', + ' "type": "function",', + ' "description": "What the tool does",', + ' "parameters": {', + ' "type": "object",', + ' "properties": {', + ' "param_name": {', + ' "type": "type of the input parameter",', + ' "description": "Description of the parameter"', + " }", + " },", + ' "required": ["param_name"],', + ' "additionalProperties": false', + " }", + "}", + "Example:", + "{", + ' "name": "user-preferences-lookup",', + ' "type": "function",', + ' "description": "Looks up user preferences by ID",', + ' "parameters": {', + ' "type": "object",', + ' "properties": {', + ' "user_id": {', + ' "type": "string",', + ' "description": "The user id"', + " }", + " },", + ' "required": ["user_id"],', + ' "additionalProperties": false', + " }", + "}", + ]) + parameters_instructions = "\n".join([ "Return these values in a JSON object with the following keys: current_instructions, current_parameters, and model.", "Example:", @@ -503,24 +937,34 @@ def _new_variation_prompt_improvement_instructions( "Do not include any other parameters that are not directly parseable by an LLM call. 
If you want to provide instruction for tone or other attributes, provide them directly in the instructions.", ]) - if previous_ctx: + if history: return "\n".join([ "## Improvement Instructions:", - "Based on the evaluation feedback above, generate improved agent instructions and parameters.", + "Based on the evaluation history above, generate improved agent instructions and parameters.", "Focus on addressing the areas where the evaluation failed or scored below threshold.", "The new configuration should aim to improve the agent's performance on the evaluation criteria.", model_instructions, "", + variable_instructions, + "", + tool_instructions, + "", "Return the improved configuration in a structured format that can be parsed to update:", "1. The agent instructions (current_instructions)", "2. The agent parameters (current_parameters)", "3. The model (model) - you must always return a model, even if it's the same as the current model.", + "4. You should return the tools the user has defined, as-is, on the new parameters. Do not modify them, but make sure you do not include internal tools like the evaluation tool or structured output tool.", parameters_instructions, ]) else: return "\n".join([ "Generate an improved version of this configuration.", model_instructions, + "", + variable_instructions, + "", + tool_instructions, + "", parameters_instructions, ]) @@ -554,7 +998,7 @@ def _apply_new_variation_response( if missing_fields: logger.error( - "[Turn %d] -> Response missing required fields: %s. Received fields: %s. Full response_data: %s", + "[Iteration %d] -> Response missing required fields: %s. Received fields: %s. 
Full response_data: %s", iteration, ", ".join(missing_fields), list(response_data.keys()), @@ -576,13 +1020,13 @@ def _apply_new_variation_response( ) if not model_value: logger.warning( - "[Turn %d] -> Model field is empty or None in response, keeping current model %s", + "[Iteration %d] -> Model field is empty or None in response, keeping current model %s", iteration, self._current_model, ) elif model_value not in self._options.model_choices: logger.warning( - "[Turn %d] -> Model '%s' not in model_choices %s, keeping current model %s", + "[Iteration %d] -> Model '%s' not in model_choices %s, keeping current model %s", iteration, model_value, self._options.model_choices, @@ -592,14 +1036,14 @@ def _apply_new_variation_response( old_model = self._current_model self._current_model = model_value logger.info( - "[Turn %d] -> Model updated from '%s' to '%s'", + "[Iteration %d] -> Model updated from '%s' to '%s'", iteration, old_model, self._current_model, ) logger.info( - "[Turn %d] -> New variation generated: instructions='%.100s...', model=%s, parameters=%s", + "[Iteration %d] -> New variation generated: instructions='%s', model=%s, parameters=%s", iteration, self._current_instructions, self._current_model, @@ -612,14 +1056,16 @@ def _apply_new_variation_response( completion_response=response_str, current_instructions=self._current_instructions, current_parameters=self._current_parameters.copy(), + current_variables=variation_ctx.current_variables, current_model=self._current_model, user_input=None, history=variation_ctx.history, iteration=variation_ctx.iteration, - structured_output_tool=variation_ctx.structured_output_tool, ) - async def _generate_new_variation(self, iteration: int) -> OptimizationContext: + async def _generate_new_variation( + self, iteration: int, variables: Dict[str, Any] + ) -> OptimizationContext: """ Generate new variation for next iteration (auto-path). 
@@ -627,40 +1073,48 @@ async def _generate_new_variation(self, iteration: int) -> OptimizationContext: and current_parameters based on the returned OptimizeContext. :param iteration: The current iteration number for logging + :param variables: The variable set for this iteration, chosen once by the caller """ - logger.info("[Turn %d] -> Generating new variation...", iteration) + logger.info("[Iteration %d] -> Generating new variation...", iteration) # Create a context for status update before generating the variation - status_ctx = self._create_optimization_context(iteration=iteration) + status_ctx = self._create_optimization_context( + iteration=iteration, + variables=variables, + ) self._safe_status_update("generating variation", status_ctx, iteration) - # Get the most recent context for previous result and feedback - previous_ctx = self._history[-1] if self._history else None - - instructions = self._build_new_variation_prompt(previous_ctx) - - # Create structured output tool definition for variation generation - structured_output_tool = create_variation_tool(self._options.model_choices) + instructions = self._build_new_variation_prompt(self._history) # Create a flat history list (without nested history) to avoid exponential growth flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history] - # Create context for variation generation — low temperature for deterministic output + # Create context for variation generation — low temperature for deterministic output. + # The variation tool is placed in current_parameters["tools"] so it surfaces through + # AIAgentConfig.model.parameters like any other tool, rather than as a separate field. 
variation_ctx = OptimizationContext( scores={}, completion_response="", current_instructions=instructions, - current_parameters={"temperature": 0.1}, + current_parameters={ + "temperature": 0.1, + "tools": [create_variation_tool(self._options.model_choices).to_dict()], + }, + current_variables=variables, current_model=self._current_model, user_input=None, history=tuple(flat_history), iteration=len(self._history) + 1, - structured_output_tool=structured_output_tool, ) # Call handle_agent_call to generate new variation; expects a JSON string # matching the structured output schema (current_instructions, current_parameters, model) - result = self._options.handle_agent_call(self._agent_key, variation_ctx) + result = self._options.handle_agent_call( + self._agent_key, + self._build_agent_config_for_context(variation_ctx), + variation_ctx, + self._builtin_agent_tool_handlers(is_variation=True), + ) response_str = await await_if_needed(result) # Extract and update current state from the parsed response @@ -684,8 +1138,6 @@ async def optimize_from_config( ) self._agent_key = agent_key - agent_config = await self._get_agent_config(agent_key) - raise NotImplementedError async def _execute_agent_turn( @@ -697,17 +1149,35 @@ async def _execute_agent_turn( Run the agent call and judge scoring for one optimization turn. Returns a new OptimizationContext with completion_response and scores - populated, leaving the input context unchanged. + populated, leaving the input context unchanged. Variables are read from + optimize_context.current_variables and interpolated into the agent's + instructions at call time so the stored template is never mutated. :param optimize_context: The context for this turn (instructions, model, history, etc.) 
:param iteration: Current iteration number for logging and status callbacks :return: Updated context with completion_response and scores filled in """ + logger.info( + "[Iteration %d] -> Calling agent (model=%s)...", + iteration, + optimize_context.current_model, + ) try: - result = self._options.handle_agent_call(self._agent_key, optimize_context) + result = self._options.handle_agent_call( + self._agent_key, + self._build_agent_config_for_context(optimize_context), + optimize_context, + self._builtin_agent_tool_handlers(is_variation=False), + ) completion_response = await await_if_needed(result) + logger.info( + "[Iteration %d] -> Agent response: %.300s%s", + iteration, + completion_response, + "..." if len(completion_response) > 300 else "", + ) except Exception: - logger.exception("[Turn %d] -> Agent call failed", iteration) + logger.exception("[Iteration %d] -> Agent call failed", iteration) if self._options.on_failing_result: self._options.on_failing_result(optimize_context) raise @@ -715,7 +1185,14 @@ async def _execute_agent_turn( scores: Dict[str, JudgeResult] = {} if self._options.judges: self._safe_status_update("evaluating", optimize_context, iteration) - scores = await self._call_judges(completion_response, iteration) + agent_tools = self._extract_agent_tools(optimize_context.current_parameters) + scores = await self._call_judges( + completion_response, + iteration, + user_input=optimize_context.user_input, + variables=optimize_context.current_variables, + agent_tools=agent_tools, + ) return dataclasses.replace( optimize_context, @@ -723,6 +1200,83 @@ async def _execute_agent_turn( scores=scores, ) + def _evaluate_response(self, optimize_context: OptimizationContext) -> bool: + """ + Determine whether the current iteration's scores meet all judge thresholds. + + A judge without an explicit threshold is treated as requiring a perfect + score of 1.0. Returns True immediately when no judges are configured. 
+ + :param optimize_context: The completed turn context containing scores + :return: True if all judges passed, False if any judge failed or is missing + """ + if not self._options.judges: + return True + + for judge_key, optimization_judge in self._options.judges.items(): + result = optimize_context.scores.get(judge_key) + if result is None: + return False + threshold = ( + optimization_judge.threshold + if optimization_judge.threshold is not None + else 1.0 + ) + if result.score < threshold: + return False + + return True + + def _handle_success( + self, optimize_context: OptimizationContext, iteration: int + ) -> Any: + """ + Handle a successful optimization result. + + Fires the "success" status update, invokes on_passing_result if set, + and returns the winning OptimizationContext. + + :param optimize_context: The context from the passing iteration + :param iteration: Current iteration number for logging + :return: The passing OptimizationContext + """ + logger.info("[Iteration %d] -> Optimization succeeded", iteration) + self._safe_status_update("success", optimize_context, iteration) + if self._options.on_passing_result: + try: + self._options.on_passing_result(optimize_context) + except Exception: + logger.exception( + "[Iteration %d] -> on_passing_result callback failed", iteration + ) + return optimize_context + + def _handle_failure( + self, optimize_context: OptimizationContext, iteration: int + ) -> Any: + """ + Handle a failed optimization result (max attempts reached). + + Fires the "failure" status update, invokes on_failing_result if set, + and returns the last OptimizationContext. 
+ + :param optimize_context: The context from the final iteration + :param iteration: Current iteration number for logging + :return: The last OptimizationContext + """ + logger.warning( + "[Optimization] -> Optimization failed after %d attempt(s)", iteration + ) + self._safe_status_update("failure", optimize_context, iteration) + if self._options.on_failing_result: + try: + self._options.on_failing_result(optimize_context) + except Exception: + logger.exception( + "[Iteration %d] -> on_failing_result callback failed", iteration + ) + return optimize_context + async def _run_optimization( self, agent_config: AIAgentConfig, options: OptimizationOptions ) -> Any: @@ -733,10 +1287,12 @@ async def _run_optimization( :return: Optimization result. """ self._options = options + self._agent_config = agent_config self._initialize_class_members_from_config(agent_config) initial_context = self._create_optimization_context( iteration=0, + variables=random.choice(options.variable_choices), ) self._safe_status_update("init", initial_context, 0) @@ -744,14 +1300,21 @@ async def _run_optimization( iteration = 0 while True: iteration += 1 - logger.info("[Turn %d] -> Starting", iteration) + logger.info( + "[Iteration %d] -> Starting (attempt %d/%d, model=%s)", + iteration, iteration, self._options.max_attempts, self._current_model, + ) user_input = None if self._options.user_input_options: user_input = random.choice(self._options.user_input_options) + if user_input: + logger.info("[Iteration %d] -> User input: %s", iteration, user_input) optimize_context = self._create_optimization_context( iteration=iteration, user_input=user_input, + # Pick a fresh variable set each turn for call-time interpolation + variables=random.choice(self._options.variable_choices), ) self._safe_status_update("generating", optimize_context, iteration) @@ -762,25 +1325,27 @@ async def _run_optimization( try: on_turn_result = self._options.on_turn(optimize_context) if on_turn_result: - # on_turn returned True 
— success + logger.info("[Iteration %d] -> on_turn returned True — turn passed", iteration) return self._handle_success(optimize_context, iteration) else: - # on_turn returned False — generate new variation and continue + logger.info( + "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)", + iteration, iteration, self._options.max_attempts, + ) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) self._history.append(optimize_context) - await self._generate_new_variation(iteration) - # Notify before starting next turn + await self._generate_new_variation(iteration, optimize_context.current_variables) self._safe_status_update( "turn completed", optimize_context, iteration ) continue except Exception as e: logger.exception( - "[Turn %d] -> on_turn evaluation failed", iteration + "[Iteration %d] -> on_turn evaluation failed", iteration ) self._history.append(optimize_context) - await self._generate_new_variation(iteration) + await self._generate_new_variation(iteration, optimize_context.current_variables) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) self._safe_status_update( @@ -791,11 +1356,15 @@ async def _run_optimization( # Auto-path: judge scores determine pass/fail via _evaluate_response passes = self._evaluate_response(optimize_context) if passes: + logger.info("[Iteration %d] -> All judges passed — turn succeeded", iteration) return self._handle_success(optimize_context, iteration) else: + logger.info( + "[Iteration %d] -> One or more judges failed (attempt %d/%d) — generating new variation", + iteration, iteration, self._options.max_attempts, + ) self._history.append(optimize_context) - await self._generate_new_variation(iteration) - # Check max_attempts after generating variation + await self._generate_new_variation(iteration, optimize_context.current_variables) if iteration >= self._options.max_attempts: return 
self._handle_failure(optimize_context, iteration) self._safe_status_update( diff --git a/packages/optimization/src/ldai_optimization/dataclasses.py b/packages/optimization/src/ldai_optimization/dataclasses.py index d57b346..bda41ab 100644 --- a/packages/optimization/src/ldai_optimization/dataclasses.py +++ b/packages/optimization/src/ldai_optimization/dataclasses.py @@ -15,6 +15,7 @@ Union, ) +from ldai import AIAgentConfig from ldclient import Context @@ -50,6 +51,7 @@ class StructuredOutputTool: name: str description: str input_schema: Dict[str, Any] # JSON schema defining the expected output structure + type: Literal["function"] = "function" def to_dict(self) -> Dict[str, Any]: """ @@ -61,8 +63,25 @@ def to_dict(self) -> Dict[str, Any]: "name": self.name, "description": self.description, "input_schema": self.input_schema, + "type": self.type, } + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "StructuredOutputTool": + """ + Construct a StructuredOutputTool from a plain dictionary. + + :param data: Dictionary with at least a ``name`` key; ``description`` and + ``input_schema`` default to empty values when absent. 
+ :return: A new StructuredOutputTool instance + """ + return cls( + name=data.get("name", ""), + description=data.get("description", ""), + input_schema=data.get("input_schema", {}), + type=data.get("type", "function"), + ) + @dataclass class Message: @@ -102,15 +121,13 @@ class OptimizationContext: completion_response: str current_instructions: str current_parameters: Dict[str, Any] + current_variables: Dict[str, Any] # variable set chosen for this iteration; interpolated into instructions at call time current_model: Optional[str] = None # the current model being used user_input: Optional[str] = None # the user input message for this iteration history: Sequence[OptimizationContext] = field( default_factory=list ) # previous context items iteration: int = 0 # current iteration number - structured_output_tool: Optional[StructuredOutputTool] = ( - None # tool definition for structured output - ) def copy_without_history(self) -> OptimizationContext: """ @@ -123,11 +140,11 @@ def copy_without_history(self) -> OptimizationContext: completion_response=self.completion_response, current_instructions=self.current_instructions, current_parameters=self.current_parameters, + current_variables=self.current_variables, current_model=self.current_model, user_input=self.user_input, history=(), # Empty history to keep it flat iteration=self.iteration, - structured_output_tool=self.structured_output_tool, ) def to_json(self) -> Dict[str, Any]: @@ -140,10 +157,6 @@ def to_json(self) -> Dict[str, Any]: for judge_key, judge_result in self.scores.items(): scores_dict[judge_key] = judge_result.to_json() - structured_output_tool_dict = None - if self.structured_output_tool: - structured_output_tool_dict = self.structured_output_tool.to_dict() - history_list = [ctx.to_json() for ctx in self.history] return { @@ -153,19 +166,18 @@ def to_json(self) -> Dict[str, Any]: "current_parameters": self.current_parameters, "current_model": self.current_model, "user_input": self.user_input, + 
"current_variables": self.current_variables, "history": history_list, "iteration": self.iteration, - "structured_output_tool": structured_output_tool_dict, } @dataclass class OptimizationJudgeContext: - """Context for judge evaluation.""" + """Context for a single judge evaluation turn.""" - messages: List[Message] - parameters: Dict[str, Any] - tools: Optional[List[Dict[str, Any]]] = None + user_input: str # the agent response being evaluated + variables: Dict[str, Any] = field(default_factory=dict) # variable set used during agent generation @dataclass @@ -183,12 +195,12 @@ class OptimizationOptions: ] # choices of interpolated variables to be chosen at random per turn, 1 min required # Actual agent/completion (judge) calls - Required handle_agent_call: Union[ - Callable[[str, OptimizationContext], str], - Callable[[str, OptimizationContext], Awaitable[str]], + Callable[[str, AIAgentConfig, OptimizationContext, Dict[str, Callable[..., Any]]], str], + Callable[[str, AIAgentConfig, OptimizationContext, Dict[str, Callable[..., Any]]], Awaitable[str]], ] handle_judge_call: Union[ - Callable[[str, OptimizationContext], str], - Callable[[str, OptimizationJudgeContext], Awaitable[str]], + Callable[[str, AIAgentConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], str], + Callable[[str, AIAgentConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], Awaitable[str]], ] # Criteria for pass/fail - Optional user_input_options: Optional[List[str]] = ( @@ -230,8 +242,6 @@ def __post_init__(self): raise ValueError("context_choices must have at least 1 context") if len(self.model_choices) < 1: raise ValueError("model_choices must have at least 1 model") - if len(self.variable_choices) < 1: - raise ValueError("variable_choices must have at least 1 variable choice") if self.judges is None and self.on_turn is None: raise ValueError("Either judges or on_turn must be provided") if self.judge_model is None: diff --git 
a/packages/optimization/src/ldai_optimization/util.py b/packages/optimization/src/ldai_optimization/util.py index 1f48405..b64a1b5 100644 --- a/packages/optimization/src/ldai_optimization/util.py +++ b/packages/optimization/src/ldai_optimization/util.py @@ -11,6 +11,66 @@ logger = logging.getLogger(__name__) +def handle_evaluation_tool_call(score: float, rationale: str) -> str: + """ + Process the return_evaluation tool call from the judge LLM. + + Serialises the score and rationale to a JSON string. The caller + (handle_judge_call implementor) should return this string as the result of + the judge turn; the framework will then parse it via _parse_judge_response + to extract the score and rationale. + + :param score: The evaluation score (0.0 to 1.0) + :param rationale: Explanation of the evaluation decision + :return: JSON string of the score and rationale + """ + return json.dumps({"score": score, "rationale": rationale}) + + +def handle_variation_tool_call( + current_instructions: str, + current_parameters: Dict[str, Any], + model: str, +) -> str: + """ + Process the return_improved_configuration tool call from the variation LLM. + + Serialises the improved configuration to a JSON string. The caller + (handle_agent_call implementor) should return this string as the result of + the variation agent turn; the framework will then parse it via + extract_json_from_response and apply it in _apply_new_variation_response. + + :param current_instructions: The improved agent instructions + :param current_parameters: The improved agent parameters (e.g. 
temperature, max_tokens) + :param model: The model to use for the improved agent + :return: JSON string of the improved configuration + """ + return json.dumps({ + "current_instructions": current_instructions, + "current_parameters": current_parameters, + "model": model, + }) + + +def interpolate_variables(text: str, variables: Dict[str, Any]) -> str: + """ + Interpolate ``{{variable}}`` placeholders in text using the provided variables. + + Matches LaunchDarkly's Mustache-style template format so that manually + generated variation instructions use the same syntax as LD-fetched templates. + Unrecognised placeholders are left unchanged. + + :param text: Template string potentially containing ``{{key}}`` placeholders + :param variables: Mapping of variable names to their replacement values + :return: Text with all recognised placeholders replaced + """ + def replace(match: re.Match) -> str: + key = match.group(1).strip() + return str(variables[key]) if key in variables else match.group(0) + + return re.sub(r"\{\{(\w+)\}\}", replace, text) + + async def await_if_needed( result: Union[str, Awaitable[str]] ) -> str: @@ -33,6 +93,7 @@ def create_evaluation_tool() -> StructuredOutputTool: :return: A StructuredOutputTool for evaluation responses """ return StructuredOutputTool( + type="function", name="return_evaluation", description="Returns an evaluation with a score and rationale.", input_schema={ @@ -59,6 +120,7 @@ def create_boolean_tool() -> StructuredOutputTool: :return: A StructuredOutputTool for boolean evaluation responses """ return StructuredOutputTool( + type="function", name="return_boolean", description="Returns a boolean value and reasoning for the evaluation.", input_schema={ @@ -86,6 +148,7 @@ def create_variation_tool(model_choices: List[str]) -> StructuredOutputTool: :return: A StructuredOutputTool for variation generation responses """ return StructuredOutputTool( + type="function", name="return_improved_configuration", description=( "Returns the 
improved agent configuration with updated instructions and parameters. " @@ -189,9 +252,9 @@ def extract_json_from_response(response_str: str) -> Dict[str, Any]: if response_data is None: logger.error( "Failed to extract JSON from response. " - "Response length: %d, first 200 chars: %s", + "Response length: %d, response: %s", len(response_str), - response_str[:200], + response_str, ) raise ValueError( "Failed to parse structured output from variation generation. " diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py new file mode 100644 index 0000000..34a7bb3 --- /dev/null +++ b/packages/optimization/tests/test_client.py @@ -0,0 +1,901 @@ +"""Tests for OptimizationClient.""" + +import json +from typing import Any, Dict +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient +from ldai.models import LDMessage, ModelConfig +from ldclient import Context + +from ldai_optimization.client import OptimizationClient +from ldai_optimization.dataclasses import ( + JudgeResult, + OptimizationContext, + OptimizationJudge, + OptimizationJudgeContext, + OptimizationOptions, + StructuredOutputTool, +) +from ldai_optimization.util import ( + create_evaluation_tool, + create_variation_tool, + handle_evaluation_tool_call, + handle_variation_tool_call, +) + +# --------------------------------------------------------------------------- +# Shared helpers / fixtures +# --------------------------------------------------------------------------- + +LD_CONTEXT = Context.create("test-user") + +AGENT_INSTRUCTIONS = "You are a helpful assistant. Answer using {{language}}." 
+VARIATION_RESPONSE = json.dumps({ + "current_instructions": "You are an improved assistant.", + "current_parameters": {"temperature": 0.5}, + "model": "gpt-4o", +}) +JUDGE_PASS_RESPONSE = json.dumps({"score": 1.0, "rationale": "Perfect answer."}) +JUDGE_FAIL_RESPONSE = json.dumps({"score": 0.2, "rationale": "Off topic."}) + + +def _make_agent_config( + instructions: str = AGENT_INSTRUCTIONS, + model_name: str = "gpt-4o", + parameters: Dict[str, Any] | None = None, +) -> AIAgentConfig: + return AIAgentConfig( + key="test-agent", + enabled=True, + model=ModelConfig(name=model_name, parameters=parameters or {}), + instructions=instructions, + ) + + +def _make_ldai_client(agent_config: AIAgentConfig | None = None) -> MagicMock: + mock = MagicMock(spec=LDAIClient) + mock.agent_config.return_value = agent_config or _make_agent_config() + mock._client = MagicMock() + mock._client.variation.return_value = {"instructions": AGENT_INSTRUCTIONS} + return mock + + +def _make_options( + *, + handle_agent_call=None, + handle_judge_call=None, + judges=None, + max_attempts: int = 3, + variable_choices=None, +) -> OptimizationOptions: + if handle_agent_call is None: + handle_agent_call = AsyncMock(return_value="The capital of France is Paris.") + if handle_judge_call is None: + handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + if judges is None: + judges = { + "accuracy": OptimizationJudge( + threshold=0.8, + acceptance_statement="The response must be accurate and concise.", + ) + } + return OptimizationOptions( + context_choices=[LD_CONTEXT], + max_attempts=max_attempts, + model_choices=["gpt-4o", "gpt-4o-mini"], + judge_model="gpt-4o", + variable_choices=variable_choices or [{"language": "English"}], + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + judges=judges, + ) + + +def _make_client(ldai: MagicMock | None = None) -> OptimizationClient: + client = OptimizationClient(ldai or _make_ldai_client()) + return client + + +# 
--------------------------------------------------------------------------- +# Util functions +# --------------------------------------------------------------------------- + + +class TestHandleEvaluationToolCall: + def test_returns_json_with_score_and_rationale(self): + result = handle_evaluation_tool_call(score=0.8, rationale="Good answer.") + data = json.loads(result) + assert data["score"] == 0.8 + assert data["rationale"] == "Good answer." + + def test_score_zero_is_valid(self): + result = handle_evaluation_tool_call(score=0.0, rationale="No match.") + assert json.loads(result)["score"] == 0.0 + + def test_result_is_valid_json_string(self): + result = handle_evaluation_tool_call(score=0.5, rationale="Partial.") + assert isinstance(result, str) + json.loads(result) # must not raise + + +class TestHandleVariationToolCall: + def test_returns_json_with_all_fields(self): + result = handle_variation_tool_call( + current_instructions="Do X.", + current_parameters={"temperature": 0.7}, + model="gpt-4o", + ) + data = json.loads(result) + assert data["current_instructions"] == "Do X." 
+ assert data["current_parameters"] == {"temperature": 0.7} + assert data["model"] == "gpt-4o" + + def test_result_is_valid_json_string(self): + result = handle_variation_tool_call( + current_instructions="Do Y.", + current_parameters={}, + model="gpt-4o-mini", + ) + assert isinstance(result, str) + json.loads(result) + + +# --------------------------------------------------------------------------- +# _extract_agent_tools +# --------------------------------------------------------------------------- + + +class TestExtractAgentTools: + def setup_method(self): + self.client = _make_client() + self.client._agent_key = "test-agent" + self.client._options = _make_options() + self.client._agent_config = _make_agent_config() + self.client._initialize_class_members_from_config(_make_agent_config()) + + def test_returns_empty_list_when_no_tools(self): + result = self.client._extract_agent_tools({}) + assert result == [] + + def test_returns_empty_list_when_tools_key_is_empty(self): + result = self.client._extract_agent_tools({"tools": []}) + assert result == [] + + def test_returns_structured_output_tool_from_dict(self): + tool_dict = { + "name": "lookup", + "description": "Looks up data", + "input_schema": {"type": "object", "properties": {}}, + } + result = self.client._extract_agent_tools({"tools": [tool_dict]}) + assert len(result) == 1 + assert isinstance(result[0], StructuredOutputTool) + assert result[0].name == "lookup" + + def test_passes_through_existing_structured_output_tool(self): + tool = StructuredOutputTool( + name="my-tool", description="desc", input_schema={} + ) + result = self.client._extract_agent_tools({"tools": [tool]}) + assert result == [tool] + + def test_wraps_single_non_list_tool(self): + tool_dict = {"name": "single", "description": "x", "input_schema": {}} + result = self.client._extract_agent_tools({"tools": tool_dict}) + assert len(result) == 1 + assert result[0].name == "single" + + def test_converts_object_with_to_dict(self): + mock_tool = 
MagicMock() + mock_tool.to_dict.return_value = { + "name": "converted", + "description": "via to_dict", + "input_schema": {}, + } + result = self.client._extract_agent_tools({"tools": [mock_tool]}) + assert len(result) == 1 + assert result[0].name == "converted" + + +# --------------------------------------------------------------------------- +# _evaluate_response +# --------------------------------------------------------------------------- + + +class TestEvaluateResponse: + def setup_method(self): + self.client = _make_client() + self.client._options = _make_options() + + def _ctx_with_scores(self, scores: Dict[str, JudgeResult]) -> OptimizationContext: + return OptimizationContext( + scores=scores, + completion_response="Some response.", + current_instructions="Do X.", + current_parameters={}, + current_variables={}, + iteration=1, + ) + + def test_passes_when_all_judges_meet_threshold(self): + ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.9)}) + assert self.client._evaluate_response(ctx) is True + + def test_fails_when_judge_below_threshold(self): + ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.5)}) + assert self.client._evaluate_response(ctx) is False + + def test_fails_when_judge_result_missing(self): + ctx = self._ctx_with_scores({}) + assert self.client._evaluate_response(ctx) is False + + def test_passes_at_exact_threshold(self): + ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.8)}) + assert self.client._evaluate_response(ctx) is True + + def test_no_judges_always_passes(self): + options = _make_options(judges=None, handle_agent_call=AsyncMock(return_value="x")) + # Need on_turn to satisfy validation — inject directly + options_with_on_turn = OptimizationOptions( + context_choices=[LD_CONTEXT], + max_attempts=1, + model_choices=["gpt-4o"], + judge_model="gpt-4o", + variable_choices=[{}], + handle_agent_call=AsyncMock(return_value="x"), + handle_judge_call=AsyncMock(return_value=JUDGE_PASS_RESPONSE), + 
judges={"j": OptimizationJudge(threshold=1.0, acceptance_statement="x")}, + on_turn=lambda ctx: True, + ) + self.client._options = options_with_on_turn + # Without judges, _evaluate_response returns True + options_no_judges = MagicMock() + options_no_judges.judges = None + self.client._options = options_no_judges + ctx = self._ctx_with_scores({}) + assert self.client._evaluate_response(ctx) is True + + def test_multiple_judges_all_must_pass(self): + self.client._options = _make_options( + judges={ + "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"), + "b": OptimizationJudge(threshold=0.9, acceptance_statement="B"), + } + ) + ctx = self._ctx_with_scores({ + "a": JudgeResult(score=0.9), + "b": JudgeResult(score=0.7), # fails + }) + assert self.client._evaluate_response(ctx) is False + + def test_multiple_judges_all_passing(self): + self.client._options = _make_options( + judges={ + "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"), + "b": OptimizationJudge(threshold=0.8, acceptance_statement="B"), + } + ) + ctx = self._ctx_with_scores({ + "a": JudgeResult(score=0.9), + "b": JudgeResult(score=1.0), + }) + assert self.client._evaluate_response(ctx) is True + + +# --------------------------------------------------------------------------- +# _builtin_judge_tool_handlers / _builtin_agent_tool_handlers +# --------------------------------------------------------------------------- + + +class TestBuiltinToolHandlers: + def setup_method(self): + self.client = _make_client() + self.client._options = _make_options() + + def test_judge_handlers_contains_evaluation_tool(self): + handlers = self.client._builtin_judge_tool_handlers() + assert create_evaluation_tool().name in handlers + + def test_judge_handler_returns_json(self): + handlers = self.client._builtin_judge_tool_handlers() + result = handlers[create_evaluation_tool().name](score=0.7, rationale="ok") + data = json.loads(result) + assert data["score"] == 0.7 + + def 
test_agent_handlers_empty_for_regular_turn(self): + handlers = self.client._builtin_agent_tool_handlers(is_variation=False) + assert handlers == {} + + def test_agent_handlers_contains_variation_tool_for_variation_turn(self): + handlers = self.client._builtin_agent_tool_handlers(is_variation=True) + expected_name = create_variation_tool(self.client._options.model_choices).name + assert expected_name in handlers + + def test_variation_handler_returns_valid_json(self): + handlers = self.client._builtin_agent_tool_handlers(is_variation=True) + name = create_variation_tool(self.client._options.model_choices).name + result = handlers[name]( + current_instructions="New instructions.", + current_parameters={"temperature": 0.3}, + model="gpt-4o", + ) + data = json.loads(result) + assert data["current_instructions"] == "New instructions." + assert data["model"] == "gpt-4o" + + +# --------------------------------------------------------------------------- +# _evaluate_acceptance_judge +# --------------------------------------------------------------------------- + + +class TestEvaluateAcceptanceJudge: + def setup_method(self): + self.client = _make_client() + agent_config = _make_agent_config() + self.client._agent_key = "test-agent" + self.client._agent_config = agent_config + self.client._initialize_class_members_from_config(agent_config) + self.handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + self.client._options = _make_options(handle_judge_call=self.handle_judge_call) + + async def test_returns_parsed_score_and_rationale(self): + judge = OptimizationJudge( + threshold=0.8, acceptance_statement="Must be concise." + ) + result = await self.client._evaluate_acceptance_judge( + judge_key="conciseness", + optimization_judge=judge, + completion_response="Paris.", + iteration=1, + reasoning_history="", + user_input="What is the capital of France?", + ) + assert result.score == 1.0 + assert result.rationale == "Perfect answer." 
+ + async def test_handle_judge_call_receives_correct_key_and_config(self): + judge = OptimizationJudge( + threshold=0.8, acceptance_statement="Must answer the question." + ) + await self.client._evaluate_acceptance_judge( + judge_key="relevance", + optimization_judge=judge, + completion_response="Some answer.", + iteration=1, + reasoning_history="", + user_input="What time is it?", + ) + call_args = self.handle_judge_call.call_args + key, config, ctx, handlers = call_args.args + assert key == "relevance" + assert isinstance(config, AIAgentConfig) + assert isinstance(ctx, OptimizationJudgeContext) + assert "relevance" in create_evaluation_tool().name or True # handlers present + assert create_evaluation_tool().name in handlers + + async def test_acceptance_statement_in_instructions(self): + statement = "Response must mention the Eiffel Tower." + judge = OptimizationJudge(threshold=0.8, acceptance_statement=statement) + await self.client._evaluate_acceptance_judge( + judge_key="tower", + optimization_judge=judge, + completion_response="Paris has the Eiffel Tower.", + iteration=1, + reasoning_history="", + user_input="Tell me about Paris.", + ) + call_args = self.handle_judge_call.call_args + _, config, _, _ = call_args.args + assert statement in config.instructions + + async def test_evaluation_tool_in_config_parameters(self): + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be brief.") + await self.client._evaluate_acceptance_judge( + judge_key="brevity", + optimization_judge=judge, + completion_response="Yes.", + iteration=1, + reasoning_history="", + user_input="Is Paris in France?", + ) + call_args = self.handle_judge_call.call_args + _, config, _, _ = call_args.args + tools = config.model.get_parameter("tools") or [] + tool_names = [t["name"] for t in tools] + assert create_evaluation_tool().name in tool_names + + async def test_agent_tools_prepended_to_config_tools(self): + agent_tool = StructuredOutputTool( + name="lookup", description="Lookup 
data", input_schema={} + ) + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Use tool.") + await self.client._evaluate_acceptance_judge( + judge_key="tool-use", + optimization_judge=judge, + completion_response="I looked it up.", + iteration=1, + reasoning_history="", + user_input="Find me something.", + agent_tools=[agent_tool], + ) + call_args = self.handle_judge_call.call_args + _, config, _, _ = call_args.args + tools = config.model.get_parameter("tools") or [] + tool_names = [t["name"] for t in tools] + assert "lookup" in tool_names + assert tool_names.index("lookup") < tool_names.index(create_evaluation_tool().name) + + async def test_variables_in_context(self): + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") + variables = {"language": "French", "topic": "geography"} + await self.client._evaluate_acceptance_judge( + judge_key="accuracy", + optimization_judge=judge, + completion_response="Paris.", + iteration=1, + reasoning_history="", + user_input="Capital?", + variables=variables, + ) + call_args = self.handle_judge_call.call_args + _, _, ctx, _ = call_args.args + assert ctx.variables == variables + + async def test_returns_zero_score_on_missing_acceptance_statement(self): + judge = OptimizationJudge(threshold=0.8, acceptance_statement=None) + result = await self.client._evaluate_acceptance_judge( + judge_key="broken", + optimization_judge=judge, + completion_response="Anything.", + iteration=1, + reasoning_history="", + user_input="Hello?", + ) + assert result.score == 0.0 + self.handle_judge_call.assert_not_called() + + async def test_returns_zero_score_on_parse_failure(self): + self.handle_judge_call.return_value = "not json at all" + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be clear.") + result = await self.client._evaluate_acceptance_judge( + judge_key="clarity", + optimization_judge=judge, + completion_response="Clear answer.", + iteration=1, + reasoning_history="", + 
user_input="Explain X.", + ) + assert result.score == 0.0 + + +# --------------------------------------------------------------------------- +# _evaluate_config_judge +# --------------------------------------------------------------------------- + + +class TestEvaluateConfigJudge: + def setup_method(self): + self.mock_ldai = _make_ldai_client() + self.client = _make_client(self.mock_ldai) + agent_config = _make_agent_config() + self.client._agent_key = "test-agent" + self.client._agent_config = agent_config + self.client._initialize_class_members_from_config(agent_config) + self.handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + self.client._options = _make_options(handle_judge_call=self.handle_judge_call) + + def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig: + return AIJudgeConfig( + key="ld-judge-key", + enabled=enabled, + model=ModelConfig(name="gpt-4o", parameters={}), + messages=[ + LDMessage(role="system", content="You are an evaluator."), + LDMessage(role="user", content="Evaluate this response."), + ], + ) + + async def test_calls_handle_judge_call_with_collapsed_instructions(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Good answer.", + iteration=1, + reasoning_history="", + user_input="What is X?", + ) + call_args = self.handle_judge_call.call_args + key, config, ctx, handlers = call_args.args + assert key == "quality" + assert "You are an evaluator." 
in config.instructions + assert isinstance(ctx, OptimizationJudgeContext) + + async def test_returns_zero_score_when_judge_disabled(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False) + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + result = await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Some answer.", + iteration=1, + reasoning_history="", + user_input="What?", + ) + assert result.score == 0.0 + self.handle_judge_call.assert_not_called() + + async def test_returns_zero_score_when_judge_has_no_messages(self): + judge_config = AIJudgeConfig( + key="ld-judge-key", + enabled=True, + model=ModelConfig(name="gpt-4o", parameters={}), + messages=None, + ) + self.mock_ldai.judge_config.return_value = judge_config + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + result = await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Any.", + iteration=1, + reasoning_history="", + user_input="Anything?", + ) + assert result.score == 0.0 + self.handle_judge_call.assert_not_called() + + async def test_template_variables_merged_into_judge_config_call(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + variables = {"language": "Spanish"} + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Answer.", + iteration=1, + reasoning_history="", + user_input="Q?", + variables=variables, + ) + call_kwargs = self.mock_ldai.judge_config.call_args + passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {}) + assert passed_vars.get("language") == "Spanish" + assert "message_history" in passed_vars + assert "response_to_evaluate" in passed_vars + + async def 
test_agent_tools_prepended_before_evaluation_tool(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + agent_tool = StructuredOutputTool(name="search", description="Search", input_schema={}) + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Answer.", + iteration=1, + reasoning_history="", + user_input="Q?", + agent_tools=[agent_tool], + ) + _, config, _, _ = self.handle_judge_call.call_args.args + tools = config.model.get_parameter("tools") or [] + names = [t["name"] for t in tools] + assert "search" in names + assert names.index("search") < names.index(create_evaluation_tool().name) + + +# --------------------------------------------------------------------------- +# _execute_agent_turn +# --------------------------------------------------------------------------- + + +class TestExecuteAgentTurn: + def setup_method(self): + self.agent_response = "Paris is the capital of France." 
+ self.handle_agent_call = AsyncMock(return_value=self.agent_response) + self.handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + self.client = _make_client() + agent_config = _make_agent_config() + self.client._agent_key = "test-agent" + self.client._agent_config = agent_config + self.client._initialize_class_members_from_config(agent_config) + self.client._options = _make_options( + handle_agent_call=self.handle_agent_call, + handle_judge_call=self.handle_judge_call, + ) + + def _make_context(self, user_input: str = "What is the capital of France?") -> OptimizationContext: + return OptimizationContext( + scores={}, + completion_response="", + current_instructions=AGENT_INSTRUCTIONS, + current_parameters={}, + current_variables={"language": "English"}, + current_model="gpt-4o", + user_input=user_input, + iteration=1, + ) + + async def test_calls_handle_agent_call_with_config_and_context(self): + ctx = self._make_context() + await self.client._execute_agent_turn(ctx, iteration=1) + self.handle_agent_call.assert_called_once() + key, config, passed_ctx, handlers = self.handle_agent_call.call_args.args + assert key == "test-agent" + assert isinstance(config, AIAgentConfig) + assert passed_ctx is ctx + assert handlers == {} + + async def test_completion_response_stored_in_returned_context(self): + ctx = self._make_context() + result = await self.client._execute_agent_turn(ctx, iteration=1) + assert result.completion_response == self.agent_response + + async def test_judge_scores_stored_in_returned_context(self): + ctx = self._make_context() + result = await self.client._execute_agent_turn(ctx, iteration=1) + assert "accuracy" in result.scores + assert result.scores["accuracy"].score == 1.0 + + async def test_variables_interpolated_into_agent_config_instructions(self): + ctx = self._make_context() + await self.client._execute_agent_turn(ctx, iteration=1) + _, config, _, _ = self.handle_agent_call.call_args.args + assert "{{language}}" not in 
config.instructions + assert "English" in config.instructions + + async def test_raises_on_agent_call_failure(self): + self.handle_agent_call.side_effect = RuntimeError("LLM unavailable") + ctx = self._make_context() + with pytest.raises(RuntimeError, match="LLM unavailable"): + await self.client._execute_agent_turn(ctx, iteration=1) + + +# --------------------------------------------------------------------------- +# _generate_new_variation +# --------------------------------------------------------------------------- + + +class TestGenerateNewVariation: + def setup_method(self): + self.handle_agent_call = AsyncMock(return_value=VARIATION_RESPONSE) + self.client = _make_client() + agent_config = _make_agent_config() + self.client._agent_key = "test-agent" + self.client._agent_config = agent_config + self.client._initial_instructions = AGENT_INSTRUCTIONS + self.client._initialize_class_members_from_config(agent_config) + self.client._options = _make_options(handle_agent_call=self.handle_agent_call) + + async def test_updates_current_instructions(self): + await self.client._generate_new_variation(iteration=1, variables={"language": "English"}) + assert self.client._current_instructions == "You are an improved assistant." 
+ + async def test_updates_current_parameters(self): + await self.client._generate_new_variation(iteration=1, variables={}) + assert self.client._current_parameters == {"temperature": 0.5} + + async def test_updates_current_model(self): + await self.client._generate_new_variation(iteration=1, variables={}) + assert self.client._current_model == "gpt-4o" + + async def test_variation_tool_in_agent_config(self): + await self.client._generate_new_variation(iteration=1, variables={}) + _, config, _, _ = self.handle_agent_call.call_args.args + tools = config.model.get_parameter("tools") or [] + tool_names = [t["name"] for t in tools] + assert create_variation_tool(self.client._options.model_choices).name in tool_names + + async def test_builtin_handlers_passed_for_variation(self): + await self.client._generate_new_variation(iteration=1, variables={}) + _, _, _, handlers = self.handle_agent_call.call_args.args + expected_name = create_variation_tool(self.client._options.model_choices).name + assert expected_name in handlers + + async def test_model_not_updated_when_not_in_model_choices(self): + bad_response = json.dumps({ + "current_instructions": "New instructions.", + "current_parameters": {}, + "model": "some-unknown-model", + }) + self.handle_agent_call.return_value = bad_response + original_model = self.client._current_model + await self.client._generate_new_variation(iteration=1, variables={}) + assert self.client._current_model == original_model + + +# --------------------------------------------------------------------------- +# Full optimization loop +# --------------------------------------------------------------------------- + + +class TestRunOptimization: + def setup_method(self): + self.mock_ldai = _make_ldai_client() + + async def test_succeeds_on_first_attempt_when_judge_passes(self): + handle_agent_call = AsyncMock(return_value="The capital of France is Paris.") + handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + client = 
_make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + ) + result = await client.optimize_from_options("test-agent", options) + assert result.scores["accuracy"].score == 1.0 + handle_agent_call.assert_called_once() + + async def test_generates_variation_when_judge_fails(self): + agent_responses = [ + "Bad answer.", + VARIATION_RESPONSE, # variation generation + "Better answer.", + ] + handle_agent_call = AsyncMock(side_effect=agent_responses) + judge_responses = [JUDGE_FAIL_RESPONSE, JUDGE_PASS_RESPONSE] + handle_judge_call = AsyncMock(side_effect=judge_responses) + client = _make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + max_attempts=3, + ) + result = await client.optimize_from_options("test-agent", options) + assert result.scores["accuracy"].score == 1.0 + assert handle_agent_call.call_count == 3 # 1 agent + 1 variation + 1 agent + + async def test_returns_last_context_after_max_attempts(self): + # The loop always calls _generate_new_variation before the max_attempts + # guard, so each of the 3 failing iterations produces a variation call. 
+ handle_agent_call = AsyncMock(side_effect=[ + "Bad answer.", # iteration 1: agent + VARIATION_RESPONSE, # iteration 1: variation + "Still bad.", # iteration 2: agent + VARIATION_RESPONSE, # iteration 2: variation + "Still bad.", # iteration 3: agent + VARIATION_RESPONSE, # iteration 3: variation (before max-attempts guard) + ]) + handle_judge_call = AsyncMock(return_value=JUDGE_FAIL_RESPONSE) + client = _make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + max_attempts=3, + ) + result = await client.optimize_from_options("test-agent", options) + assert result.scores["accuracy"].score == 0.2 + + async def test_on_passing_result_called_on_success(self): + on_passing = MagicMock() + handle_agent_call = AsyncMock(return_value="Great answer.") + handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + client = _make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + ) + options.on_passing_result = on_passing + await client.optimize_from_options("test-agent", options) + on_passing.assert_called_once() + + async def test_on_failing_result_called_on_max_attempts(self): + on_failing = MagicMock() + handle_agent_call = AsyncMock(side_effect=[ + "Bad.", # iteration 1: agent + VARIATION_RESPONSE, # iteration 1: variation + "Still bad.", # iteration 2: agent + VARIATION_RESPONSE, # iteration 2: variation (before max-attempts guard) + ]) + handle_judge_call = AsyncMock(return_value=JUDGE_FAIL_RESPONSE) + client = _make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + max_attempts=2, + ) + options.on_failing_result = on_failing + await client.optimize_from_options("test-agent", options) + on_failing.assert_called_once() + + async def test_on_turn_manual_path_success(self): + handle_agent_call = AsyncMock(return_value="Answer.") + 
handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + client = _make_client(self.mock_ldai) + options = OptimizationOptions( + context_choices=[LD_CONTEXT], + max_attempts=3, + model_choices=["gpt-4o"], + judge_model="gpt-4o", + variable_choices=[{}], + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + judges={"j": OptimizationJudge(threshold=0.8, acceptance_statement="x")}, + on_turn=lambda ctx: True, + ) + result = await client.optimize_from_options("test-agent", options) + assert result.completion_response == "Answer." + + async def test_status_update_callback_called_at_each_stage(self): + statuses = [] + handle_agent_call = AsyncMock(return_value="Good answer.") + handle_judge_call = AsyncMock(return_value=JUDGE_PASS_RESPONSE) + client = _make_client(self.mock_ldai) + options = _make_options( + handle_agent_call=handle_agent_call, + handle_judge_call=handle_judge_call, + ) + options.on_status_update = lambda status, ctx: statuses.append(status) + await client.optimize_from_options("test-agent", options) + assert "init" in statuses + assert "generating" in statuses + assert "evaluating" in statuses + assert "success" in statuses + + +# --------------------------------------------------------------------------- +# Variation prompt — acceptance criteria section +# --------------------------------------------------------------------------- + + +class TestVariationPromptAcceptanceCriteria: + def setup_method(self): + self.client = _make_client() + agent_config = _make_agent_config() + self.client._agent_key = "test-agent" + self.client._agent_config = agent_config + self.client._initial_instructions = AGENT_INSTRUCTIONS + self.client._initialize_class_members_from_config(agent_config) + + def _set_judges(self, judges): + self.client._options = _make_options(judges=judges) + + def test_includes_acceptance_statement_in_section(self): + self._set_judges({ + "quality": OptimizationJudge( + threshold=0.8, + 
acceptance_statement="Responses must be concise and factual.", + ) + }) + section = self.client._new_variation_prompt_acceptance_criteria() + assert "Responses must be concise and factual." in section + assert "quality" in section + + def test_labels_all_judges(self): + self._set_judges({ + "a": OptimizationJudge(threshold=0.8, acceptance_statement="Must be brief."), + "b": OptimizationJudge(threshold=0.9, acceptance_statement="Must cite sources."), + }) + section = self.client._new_variation_prompt_acceptance_criteria() + assert "[a]" in section + assert "[b]" in section + assert "Must be brief." in section + assert "Must cite sources." in section + + def test_returns_empty_string_when_no_acceptance_statements(self): + self._set_judges({ + "ld-judge": OptimizationJudge(threshold=0.8, judge_key="some-ld-key"), + }) + section = self.client._new_variation_prompt_acceptance_criteria() + assert section == "" + + def test_returns_empty_string_with_no_judges(self): + options = MagicMock() + options.judges = None + self.client._options = options + section = self.client._new_variation_prompt_acceptance_criteria() + assert section == "" + + def test_section_appears_in_full_prompt(self): + self._set_judges({ + "accuracy": OptimizationJudge( + threshold=0.8, + acceptance_statement="Facts only.", + ) + }) + prompt = self.client._build_new_variation_prompt([]) + assert "Facts only." 
in prompt + assert "ACCEPTANCE CRITERIA" in prompt From ea596a7ec84733b9198b475cfa81b743b61feb9b Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 30 Mar 2026 17:12:54 -0800 Subject: [PATCH 03/11] feat: implement ability to use completions or agents for judge calls --- .../src/ldai_optimization/__init__.py | 12 ++ .../src/ldai_optimization/client.py | 34 ++++-- .../src/ldai_optimization/dataclasses.py | 27 +++- packages/optimization/tests/test_client.py | 115 +++++++++++++++++- packages/optimization/tests/test_package.py | 7 +- 5 files changed, 176 insertions(+), 19 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/__init__.py b/packages/optimization/src/ldai_optimization/__init__.py index 6319dee..3773a95 100644 --- a/packages/optimization/src/ldai_optimization/__init__.py +++ b/packages/optimization/src/ldai_optimization/__init__.py @@ -4,10 +4,22 @@ """ from ldai_optimization.client import OptimizationClient +from ldai_optimization.dataclasses import ( + AIJudgeCallConfig, + OptimizationContext, + OptimizationJudge, + OptimizationJudgeContext, + OptimizationOptions, +) __version__ = "0.0.0" __all__ = [ '__version__', + 'AIJudgeCallConfig', 'OptimizationClient', + 'OptimizationContext', + 'OptimizationJudge', + 'OptimizationJudgeContext', + 'OptimizationOptions', ] diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index 5674e99..aaab64a 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -8,10 +8,11 @@ import json from ldai import LDAIClient, AIJudgeConfig, AIJudgeConfigDefault, AIAgentConfig -from ldai.models import ModelConfig +from ldai.models import LDMessage, ModelConfig from ldclient import Context from ldai_optimization.dataclasses import ( + AIJudgeCallConfig, AutoCommitConfig, JudgeResult, OptimizationContext, @@ -456,7 +457,9 @@ async def _evaluate_config_judge( ) return 
JudgeResult(score=0.0, rationale=None) - # Collapse all system messages into a single instructions string; collect the user message + # Split messages into system and user turns. + # System turns are joined into a single instructions string (agents SDK path). + # A rebuilt system-and-user message list is supplied for the completions path. system_parts = [] user_parts = [] for msg in judge_config.messages: @@ -472,6 +475,13 @@ instructions = "\n\n".join(system_parts) judge_user_input = "\n\n".join(user_parts) if user_parts else f"Here is the response to evaluate: {completion_response}" + # Rebuild the message list with the updated system content so completions users + # receive the same scoring instructions that are baked into `instructions`. + updated_messages: List[LDMessage] = [ + LDMessage(role="system", content=instructions), + LDMessage(role="user", content=judge_user_input), + ] + # Collect model parameters from the judge config, separating out any existing tools model_name = judge_config.model.name if judge_config.model else self._options.judge_model model_params: Dict[str, Any] = {} @@ -496,15 +506,14 @@ # Add structured output tool for score and rationale tools.append(create_evaluation_tool()) - judge_agent_config = AIAgentConfig( + judge_call_config = AIJudgeCallConfig( key=judge_key, - enabled=True, model=ModelConfig( name=model_name, parameters={**model_params, "tools": [t.to_dict() for t in tools]}, ), instructions=instructions, - provider=self._agent_config.provider, + messages=updated_messages, ) judge_ctx = OptimizationJudgeContext( @@ -513,7 +522,7 @@ ) result = self._options.handle_judge_call( - judge_key, judge_agent_config, judge_ctx, self._builtin_judge_tool_handlers() + judge_key, judge_call_config, judge_ctx, self._builtin_judge_tool_handlers() ) judge_response_str = await await_if_needed(result) @@ -602,23 +611,28 @@ # 
Prepend agent tools so the judge can invoke them for verification if needed tools: List[StructuredOutputTool] = list(resolved_agent_tools) + [create_evaluation_tool()] - judge_agent_config = AIAgentConfig( + judge_user_input = f"Here is the response to evaluate: {completion_response}" + + judge_call_config = AIJudgeCallConfig( key=judge_key, - enabled=True, model=ModelConfig( name=self._options.judge_model, parameters={"tools": [t.to_dict() for t in tools]}, ), instructions=instructions, + messages=[ + LDMessage(role="system", content=instructions), + LDMessage(role="user", content=judge_user_input), + ], ) judge_ctx = OptimizationJudgeContext( - user_input=f"Here is the response to evaluate: {completion_response}", + user_input=judge_user_input, variables=resolved_variables, ) result = self._options.handle_judge_call( - judge_key, judge_agent_config, judge_ctx, self._builtin_judge_tool_handlers() + judge_key, judge_call_config, judge_ctx, self._builtin_judge_tool_handlers() ) judge_response = await await_if_needed(result) diff --git a/packages/optimization/src/ldai_optimization/dataclasses.py b/packages/optimization/src/ldai_optimization/dataclasses.py index bda41ab..92fa075 100644 --- a/packages/optimization/src/ldai_optimization/dataclasses.py +++ b/packages/optimization/src/ldai_optimization/dataclasses.py @@ -16,6 +16,7 @@ ) from ldai import AIAgentConfig +from ldai.models import LDMessage, ModelConfig from ldclient import Context @@ -83,6 +84,28 @@ def from_dict(cls, data: Dict[str, Any]) -> "StructuredOutputTool": ) +@dataclass +class AIJudgeCallConfig: + """ + Configuration passed to ``handle_judge_call``. + + Carries everything needed to run a judge in either paradigm: + + * **Completions path** — pass ``messages`` directly to ``chat.completions.create``. + The full system + user turn sequence is already assembled and interpolated. 
+ * **Agents path** — use ``instructions`` as the system prompt and + ``OptimizationJudgeContext.user_input`` as the ``Runner.run`` input. + + Both fields are always populated, regardless of whether the judge comes from a + LaunchDarkly flag (config judge) or an inline acceptance statement. + """ + + key: str + model: ModelConfig + instructions: str + messages: List[LDMessage] + + @dataclass class Message: """A message in a conversation.""" @@ -199,8 +222,8 @@ class OptimizationOptions: Callable[[str, AIAgentConfig, OptimizationContext, Dict[str, Callable[..., Any]]], Awaitable[str]], ] handle_judge_call: Union[ - Callable[[str, AIAgentConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], str], - Callable[[str, AIAgentConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], Awaitable[str]], + Callable[[str, AIJudgeCallConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], str], + Callable[[str, AIJudgeCallConfig, OptimizationJudgeContext, Dict[str, Callable[..., Any]]], Awaitable[str]], ] # Criteria for pass/fail - Optional user_input_options: Optional[List[str]] = ( diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py index 34a7bb3..ae5ccb7 100644 --- a/packages/optimization/tests/test_client.py +++ b/packages/optimization/tests/test_client.py @@ -11,6 +11,7 @@ from ldai_optimization.client import OptimizationClient from ldai_optimization.dataclasses import ( + AIJudgeCallConfig, JudgeResult, OptimizationContext, OptimizationJudge, @@ -371,11 +372,58 @@ async def test_handle_judge_call_receives_correct_key_and_config(self): call_args = self.handle_judge_call.call_args key, config, ctx, handlers = call_args.args assert key == "relevance" - assert isinstance(config, AIAgentConfig) + assert isinstance(config, AIJudgeCallConfig) assert isinstance(ctx, OptimizationJudgeContext) - assert "relevance" in create_evaluation_tool().name or True # handlers present assert 
create_evaluation_tool().name in handlers + async def test_messages_has_system_and_user_turns(self): + judge = OptimizationJudge( + threshold=0.8, acceptance_statement="Must be factual." + ) + await self.client._evaluate_acceptance_judge( + judge_key="facts", + optimization_judge=judge, + completion_response="The sky is blue.", + iteration=1, + reasoning_history="", + user_input="What colour is the sky?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + roles = [m.role for m in config.messages] + assert roles == ["system", "user"] + + async def test_messages_system_content_matches_instructions(self): + judge = OptimizationJudge( + threshold=0.8, acceptance_statement="Be concise." + ) + await self.client._evaluate_acceptance_judge( + judge_key="brevity", + optimization_judge=judge, + completion_response="Yes.", + iteration=1, + reasoning_history="", + user_input="Is Paris in France?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + system_msg = next(m for m in config.messages if m.role == "system") + assert system_msg.content == config.instructions + + async def test_messages_user_content_matches_context_user_input(self): + judge = OptimizationJudge( + threshold=0.8, acceptance_statement="Answer directly." + ) + await self.client._evaluate_acceptance_judge( + judge_key="directness", + optimization_judge=judge, + completion_response="Paris.", + iteration=1, + reasoning_history="", + user_input="Capital of France?", + ) + _, config, ctx, _ = self.handle_judge_call.call_args.args + user_msg = next(m for m in config.messages if m.role == "user") + assert user_msg.content == ctx.user_input + async def test_acceptance_statement_in_instructions(self): statement = "Response must mention the Eiffel Tower." 
judge = OptimizationJudge(threshold=0.8, acceptance_statement=statement) @@ -498,7 +546,7 @@ def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig: ], ) - async def test_calls_handle_judge_call_with_collapsed_instructions(self): + async def test_calls_handle_judge_call_with_correct_config_type(self): self.mock_ldai.judge_config.return_value = self._make_judge_config() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( @@ -512,9 +560,70 @@ async def test_calls_handle_judge_call_with_collapsed_instructions(self): call_args = self.handle_judge_call.call_args key, config, ctx, handlers = call_args.args assert key == "quality" + assert isinstance(config, AIJudgeCallConfig) assert "You are an evaluator." in config.instructions assert isinstance(ctx, OptimizationJudgeContext) + async def test_messages_has_system_and_user_turns(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Good answer.", + iteration=1, + reasoning_history="", + user_input="What is X?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + roles = [m.role for m in config.messages] + assert roles == ["system", "user"] + + async def test_messages_system_content_matches_instructions(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Good answer.", + iteration=1, + reasoning_history="", + user_input="What is X?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + system_msg = next(m for m in config.messages if m.role == "system") + assert system_msg.content == config.instructions + + async def 
test_messages_user_content_matches_context_user_input(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Good answer.", + iteration=1, + reasoning_history="", + user_input="What is X?", + ) + _, config, ctx, _ = self.handle_judge_call.call_args.args + user_msg = next(m for m in config.messages if m.role == "user") + assert user_msg.content == ctx.user_input + + async def test_messages_user_content_contains_ld_user_message(self): + self.mock_ldai.judge_config.return_value = self._make_judge_config() + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="Good answer.", + iteration=1, + reasoning_history="", + user_input="What is X?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + user_msg = next(m for m in config.messages if m.role == "user") + assert "Evaluate this response." 
in user_msg.content + async def test_returns_zero_score_when_judge_disabled(self): self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") diff --git a/packages/optimization/tests/test_package.py b/packages/optimization/tests/test_package.py index a8356a8..4e6d6c0 100644 --- a/packages/optimization/tests/test_package.py +++ b/packages/optimization/tests/test_package.py @@ -10,7 +10,6 @@ def test_version_is_string(): assert len(__version__) > 0 -def test_optimize_not_implemented(): - client = OptimizationClient() - with pytest.raises(NotImplementedError): - client.optimize("example", {}) +def test_client_requires_ldai_client(): + with pytest.raises(TypeError): + OptimizationClient() # type: ignore[call-arg] From 2fd55e2b02e4f7b2a9b772655409d5eaadba3593 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 30 Mar 2026 17:21:48 -0800 Subject: [PATCH 04/11] feat: all logs -> debug --- .../optimization/src/ldai_optimization/client.py | 16 ++++++++-------- .../optimization/src/ldai_optimization/util.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index aaab64a..7139a82 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -372,7 +372,7 @@ async def _call_judges( threshold = optimization_judge.threshold if optimization_judge.threshold is not None else 1.0 passed = result.score >= threshold - logger.info( + logger.debug( "[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s", iteration, judge_key, @@ -388,7 +388,7 @@ async def _call_judges( judge_results[judge_key] = JudgeResult(score=0.0, rationale=None) judge_results_json = self._serialize_scores(judge_results) - logger.info( + logger.debug( "[Iteration %d] -> Evaluation result: %s", iteration, 
json.dumps(judge_results_json, indent=2), @@ -526,7 +526,7 @@ async def _evaluate_config_judge( ) judge_response_str = await await_if_needed(result) - logger.info( + logger.debug( "[Iteration %d] -> Judge response (%s): %s", iteration, judge_key, @@ -636,7 +636,7 @@ async def _evaluate_acceptance_judge( ) judge_response = await await_if_needed(result) - logger.info( + logger.debug( "[Iteration %d] -> Judge response (%s): %s", iteration, judge_key, judge_response ) @@ -1011,7 +1011,7 @@ def _apply_new_variation_response( missing_fields.append("model") if missing_fields: - logger.error( + logger.debug( "[Iteration %d] -> Response missing required fields: %s. Received fields: %s. Full response_data: %s", iteration, ", ".join(missing_fields), @@ -1056,7 +1056,7 @@ def _apply_new_variation_response( self._current_model, ) - logger.info( + logger.debug( "[Iteration %d] -> New variation generated: instructions='%s', model=%s, parameters=%s", iteration, self._current_instructions, @@ -1184,7 +1184,7 @@ async def _execute_agent_turn( self._builtin_agent_tool_handlers(is_variation=False), ) completion_response = await await_if_needed(result) - logger.info( + logger.debug( "[Iteration %d] -> Agent response: %.300s%s", iteration, completion_response, @@ -1322,7 +1322,7 @@ async def _run_optimization( if self._options.user_input_options: user_input = random.choice(self._options.user_input_options) if user_input: - logger.info("[Iteration %d] -> User input: %s", iteration, user_input) + logger.debug("[Iteration %d] -> User input: %s", iteration, user_input) optimize_context = self._create_optimization_context( iteration=iteration, diff --git a/packages/optimization/src/ldai_optimization/util.py b/packages/optimization/src/ldai_optimization/util.py index b64a1b5..40910e8 100644 --- a/packages/optimization/src/ldai_optimization/util.py +++ b/packages/optimization/src/ldai_optimization/util.py @@ -241,7 +241,7 @@ def extract_json_from_response(response_str: str) -> Dict[str, Any]: 
try: response_data = json.loads(json_match.group()) except json.JSONDecodeError: - logger.error( + logger.debug( "Extracted JSON string failed to parse: %s", json_match.group()[:200], ) @@ -250,7 +250,7 @@ def extract_json_from_response(response_str: str) -> Dict[str, Any]: ) if response_data is None: - logger.error( + logger.debug( "Failed to extract JSON from response. " "Response length: %d, response: %s", len(response_str), From 8481690903e931b67c5ae5ca5fbadd10a6b4102a Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 30 Mar 2026 17:43:33 -0800 Subject: [PATCH 05/11] fix: lints + structured output tool rename --- .../src/ldai_optimization/__init__.py | 2 + .../src/ldai_optimization/client.py | 54 ++++++++++--------- .../src/ldai_optimization/dataclasses.py | 8 +-- .../src/ldai_optimization/util.py | 20 +++---- packages/optimization/tests/test_client.py | 20 ++++--- 5 files changed, 55 insertions(+), 49 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/__init__.py b/packages/optimization/src/ldai_optimization/__init__.py index 3773a95..a0fc60a 100644 --- a/packages/optimization/src/ldai_optimization/__init__.py +++ b/packages/optimization/src/ldai_optimization/__init__.py @@ -10,6 +10,7 @@ OptimizationJudge, OptimizationJudgeContext, OptimizationOptions, + ToolDefinition, ) __version__ = "0.0.0" @@ -22,4 +23,5 @@ 'OptimizationJudge', 'OptimizationJudgeContext', 'OptimizationOptions', + 'ToolDefinition', ] diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index 7139a82..7e186aa 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -1,6 +1,6 @@ """Client for LaunchDarkly AI agent optimization.""" -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Literal, Optional import dataclasses import os import logging @@ -19,7 +19,7 @@ OptimizationJudge, 
OptimizationJudgeContext, OptimizationOptions, - StructuredOutputTool, + ToolDefinition, ) from ldai_optimization.util import ( await_if_needed, @@ -60,7 +60,9 @@ def _initialize_class_members_from_config( self, agent_config: AIAgentConfig ) -> None: self._current_instructions = agent_config.instructions or "" - self._current_parameters: Dict[str, Any] = agent_config.model._parameters or {} + self._current_parameters: Dict[str, Any] = ( + agent_config.model._parameters if agent_config.model else None + ) or {} self._current_model: Optional[str] = ( agent_config.model.name if agent_config.model else None ) @@ -128,7 +130,10 @@ def _create_optimization_context( ) def _safe_status_update( - self, status: str, context: OptimizationContext, iteration: int + self, + status: Literal["init", "generating", "evaluating", "generating variation", "turn completed", "success", "failure"], + context: OptimizationContext, + iteration: int, ) -> None: """ Safely call on_status_update callback, catching and logging errors. @@ -215,10 +220,10 @@ def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[StructuredOut Extract and normalise the tools list from agent parameters. Reads the ``tools`` key from *parameters* (if present) and converts - every entry to a StructuredOutputTool so judges receive typed objects. + every entry to a ToolDefinition so judges receive typed objects. 
:param parameters: The agent's current_parameters dict - :return: List of StructuredOutputTool instances, empty list if no tools are configured + :return: List of ToolDefinition instances, empty list if no tools are configured """ raw_tools = parameters.get("tools", []) if not raw_tools: @@ -228,12 +233,12 @@ def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[StructuredOut result = [] for tool in raw_tools: - if isinstance(tool, StructuredOutputTool): + if isinstance(tool, ToolDefinition): result.append(tool) elif hasattr(tool, "to_dict"): - result.append(StructuredOutputTool.from_dict(tool.to_dict())) + result.append(ToolDefinition.from_dict(tool.to_dict())) elif isinstance(tool, dict): - result.append(StructuredOutputTool.from_dict(tool)) + result.append(ToolDefinition.from_dict(tool)) return result def _parse_judge_response( @@ -310,7 +315,7 @@ async def _call_judges( iteration: int, user_input: str, variables: Optional[Dict[str, Any]] = None, - agent_tools: Optional[List[StructuredOutputTool]] = None, + agent_tools: Optional[List[ToolDefinition]] = None, ) -> Dict[str, JudgeResult]: """ Call all judges in parallel (auto-path). @@ -331,7 +336,7 @@ async def _call_judges( return {} resolved_variables: Dict[str, Any] = variables or {} - resolved_agent_tools: List[Dict[str, Any]] = agent_tools or [] + resolved_agent_tools: List[ToolDefinition] = agent_tools or [] logger.info("[Iteration %d] -> Executing evaluation...", iteration) reasoning_history = self._build_reasoning_history() @@ -404,7 +409,7 @@ async def _evaluate_config_judge( reasoning_history: str, user_input: str, variables: Optional[Dict[str, Any]] = None, - agent_tools: Optional[List[StructuredOutputTool]] = None, + agent_tools: Optional[List[ToolDefinition]] = None, ) -> JudgeResult: """ Evaluate using a config-type judge (with judge_key). 
@@ -434,6 +439,7 @@ async def _evaluate_config_judge( "response_to_evaluate": completion_response, } + assert optimization_judge.judge_key is not None judge_config = self._judge_config( optimization_judge.judge_key, self._options.context_choices[0], @@ -485,18 +491,18 @@ async def _evaluate_config_judge( # Collect model parameters from the judge config, separating out any existing tools model_name = judge_config.model.name if judge_config.model else self._options.judge_model model_params: Dict[str, Any] = {} - tools: List[StructuredOutputTool] = [] + tools: List[ToolDefinition] = [] if judge_config.model and judge_config.model._parameters: existing_tools = judge_config.model._parameters.get("tools") if existing_tools: raw = existing_tools if isinstance(existing_tools, list) else [existing_tools] for t in raw: - if isinstance(t, StructuredOutputTool): + if isinstance(t, ToolDefinition): tools.append(t) elif hasattr(t, "to_dict"): - tools.append(StructuredOutputTool.from_dict(t.to_dict())) + tools.append(ToolDefinition.from_dict(t.to_dict())) elif isinstance(t, dict): - tools.append(StructuredOutputTool.from_dict(t)) + tools.append(ToolDefinition.from_dict(t)) model_params = {k: v for k, v in judge_config.model._parameters.items() if k != "tools"} # Prepend agent tools so the judge can call them when verifying the response @@ -552,7 +558,7 @@ async def _evaluate_acceptance_judge( reasoning_history: str, user_input: str, variables: Optional[Dict[str, Any]] = None, - agent_tools: Optional[List[StructuredOutputTool]] = None, + agent_tools: Optional[List[ToolDefinition]] = None, ) -> JudgeResult: """ Evaluate using an acceptance statement judge. 
@@ -609,7 +615,7 @@ async def _evaluate_acceptance_judge( ) # Prepend agent tools so the judge can invoke them for verification if needed - tools: List[StructuredOutputTool] = list(resolved_agent_tools) + [create_evaluation_tool()] + tools: List[ToolDefinition] = list(resolved_agent_tools) + [create_evaluation_tool()] judge_user_input = f"Here is the response to evaluate: {completion_response}" @@ -889,7 +895,7 @@ def _new_variation_prompt_improvement_instructions( "\nSTART:" "\n" + self._initial_instructions + "\n", "\nEND OF ORIGINAL INSTRUCTIONS\n", - "The following prompt variables are available and are the only variables that should be used: {placeholder_list}" + f"The following prompt variables are available and are the only variables that should be used: {placeholder_list}" "Here is an example of a good response if an {{id}} placeholder is available: 'Select records matching id {{id}}'", "Here is an example of a bad response if an {{id}} placeholder is available: 'Select records matching id 1232'", "Here is an example of a good response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type {{resource_type}}'", @@ -1203,7 +1209,7 @@ async def _execute_agent_turn( scores = await self._call_judges( completion_response, iteration, - user_input=optimize_context.user_input, + user_input=optimize_context.user_input or "", variables=optimize_context.current_variables, agent_tools=agent_tools, ) @@ -1358,10 +1364,10 @@ async def _run_optimization( logger.exception( "[Iteration %d] -> on_turn evaluation failed", iteration ) - self._history.append(optimize_context) - await self._generate_new_variation(iteration, optimize_context.current_variables) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) + self._history.append(optimize_context) + await self._generate_new_variation(iteration, optimize_context.current_variables) self._safe_status_update( "turn 
completed", optimize_context, iteration ) @@ -1377,10 +1383,10 @@ async def _run_optimization( "[Iteration %d] -> One or more judges failed (attempt %d/%d) — generating new variation", iteration, iteration, self._options.max_attempts, ) - self._history.append(optimize_context) - await self._generate_new_variation(iteration, optimize_context.current_variables) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) + self._history.append(optimize_context) + await self._generate_new_variation(iteration, optimize_context.current_variables) self._safe_status_update( "turn completed", optimize_context, iteration ) diff --git a/packages/optimization/src/ldai_optimization/dataclasses.py b/packages/optimization/src/ldai_optimization/dataclasses.py index 92fa075..b8f6e93 100644 --- a/packages/optimization/src/ldai_optimization/dataclasses.py +++ b/packages/optimization/src/ldai_optimization/dataclasses.py @@ -40,7 +40,7 @@ def to_json(self) -> Dict[str, Any]: @dataclass -class StructuredOutputTool: +class ToolDefinition: """ Generic tool definition for enforcing structured output from LLM responses. @@ -68,13 +68,13 @@ def to_dict(self) -> Dict[str, Any]: } @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "StructuredOutputTool": + def from_dict(cls, data: Dict[str, Any]) -> "ToolDefinition": """ - Construct a StructuredOutputTool from a plain dictionary. + Construct a ToolDefinition from a plain dictionary. :param data: Dictionary with at least a ``name`` key; ``description`` and ``input_schema`` default to empty values when absent. 
- :return: A new StructuredOutputTool instance + :return: A new ToolDefinition instance """ return cls( name=data.get("name", ""), diff --git a/packages/optimization/src/ldai_optimization/util.py b/packages/optimization/src/ldai_optimization/util.py index 40910e8..00e2df4 100644 --- a/packages/optimization/src/ldai_optimization/util.py +++ b/packages/optimization/src/ldai_optimization/util.py @@ -6,7 +6,7 @@ import re from typing import Any, Awaitable, Dict, List, Optional, Union -from ldai_optimization.dataclasses import StructuredOutputTool +from ldai_optimization.dataclasses import ToolDefinition logger = logging.getLogger(__name__) @@ -86,13 +86,13 @@ async def await_if_needed( return result -def create_evaluation_tool() -> StructuredOutputTool: +def create_evaluation_tool() -> ToolDefinition: """ Create the structured output tool for judge evaluations. - :return: A StructuredOutputTool for evaluation responses + :return: A ToolDefinition for evaluation responses """ - return StructuredOutputTool( + return ToolDefinition( type="function", name="return_evaluation", description="Returns an evaluation with a score and rationale.", @@ -113,13 +113,13 @@ def create_evaluation_tool() -> StructuredOutputTool: ) -def create_boolean_tool() -> StructuredOutputTool: +def create_boolean_tool() -> ToolDefinition: """ Create the structured output tool for acceptance judges. - :return: A StructuredOutputTool for boolean evaluation responses + :return: A ToolDefinition for boolean evaluation responses """ - return StructuredOutputTool( + return ToolDefinition( type="function", name="return_boolean", description="Returns a boolean value and reasoning for the evaluation.", @@ -140,14 +140,14 @@ def create_boolean_tool() -> StructuredOutputTool: ) -def create_variation_tool(model_choices: List[str]) -> StructuredOutputTool: +def create_variation_tool(model_choices: List[str]) -> ToolDefinition: """ Create the structured output tool for variation generation. 
:param model_choices: List of model IDs the LLM may select from - :return: A StructuredOutputTool for variation generation responses + :return: A ToolDefinition for variation generation responses """ - return StructuredOutputTool( + return ToolDefinition( type="function", name="return_improved_configuration", description=( diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py index ae5ccb7..b580524 100644 --- a/packages/optimization/tests/test_client.py +++ b/packages/optimization/tests/test_client.py @@ -17,7 +17,7 @@ OptimizationJudge, OptimizationJudgeContext, OptimizationOptions, - StructuredOutputTool, + ToolDefinition, ) from ldai_optimization.util import ( create_evaluation_tool, @@ -172,11 +172,11 @@ def test_returns_structured_output_tool_from_dict(self): } result = self.client._extract_agent_tools({"tools": [tool_dict]}) assert len(result) == 1 - assert isinstance(result[0], StructuredOutputTool) + assert isinstance(result[0], ToolDefinition) assert result[0].name == "lookup" def test_passes_through_existing_structured_output_tool(self): - tool = StructuredOutputTool( + tool = ToolDefinition( name="my-tool", description="desc", input_schema={} ) result = self.client._extract_agent_tools({"tools": [tool]}) @@ -456,7 +456,7 @@ async def test_evaluation_tool_in_config_parameters(self): assert create_evaluation_tool().name in tool_names async def test_agent_tools_prepended_to_config_tools(self): - agent_tool = StructuredOutputTool( + agent_tool = ToolDefinition( name="lookup", description="Lookup data", input_schema={} ) judge = OptimizationJudge(threshold=0.8, acceptance_statement="Use tool.") @@ -679,7 +679,7 @@ async def test_template_variables_merged_into_judge_config_call(self): async def test_agent_tools_prepended_before_evaluation_tool(self): self.mock_ldai.judge_config.return_value = self._make_judge_config() - agent_tool = StructuredOutputTool(name="search", description="Search", input_schema={}) + 
agent_tool = ToolDefinition(name="search", description="Search", input_schema={}) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -858,15 +858,14 @@ async def test_generates_variation_when_judge_fails(self): assert handle_agent_call.call_count == 3 # 1 agent + 1 variation + 1 agent async def test_returns_last_context_after_max_attempts(self): - # The loop always calls _generate_new_variation before the max_attempts - # guard, so each of the 3 failing iterations produces a variation call. + # The max_attempts guard fires before variation on the final iteration, + # so only iterations 1 and 2 produce a variation call. handle_agent_call = AsyncMock(side_effect=[ "Bad answer.", # iteration 1: agent VARIATION_RESPONSE, # iteration 1: variation "Still bad.", # iteration 2: agent VARIATION_RESPONSE, # iteration 2: variation - "Still bad.", # iteration 3: agent - VARIATION_RESPONSE, # iteration 3: variation (before max-attempts guard) + "Still bad.", # iteration 3: agent (max_attempts reached — no variation) ]) handle_judge_call = AsyncMock(return_value=JUDGE_FAIL_RESPONSE) client = _make_client(self.mock_ldai) @@ -896,8 +895,7 @@ async def test_on_failing_result_called_on_max_attempts(self): handle_agent_call = AsyncMock(side_effect=[ "Bad.", # iteration 1: agent VARIATION_RESPONSE, # iteration 1: variation - "Still bad.", # iteration 2: agent - VARIATION_RESPONSE, # iteration 2: variation (before max-attempts guard) + "Still bad.", # iteration 2: agent (max_attempts reached — no variation) ]) handle_judge_call = AsyncMock(return_value=JUDGE_FAIL_RESPONSE) client = _make_client(self.mock_ldai) From f8e55092fa9b06cd35b90fcd2979a2c84b042e7a Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 30 Mar 2026 17:46:33 -0800 Subject: [PATCH 06/11] fix: lint + missed variable rename --- packages/optimization/src/ldai_optimization/client.py | 2 +- 
packages/optimization/src/ldai_optimization/util.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index 7e186aa..c8747fd 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -215,7 +215,7 @@ def _serialize_scores( """ return {key: result.to_json() for key, result in judge_results.items()} - def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[StructuredOutputTool]: + def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[ToolDefinition]: """ Extract and normalise the tools list from agent parameters. diff --git a/packages/optimization/src/ldai_optimization/util.py b/packages/optimization/src/ldai_optimization/util.py index 00e2df4..7996182 100644 --- a/packages/optimization/src/ldai_optimization/util.py +++ b/packages/optimization/src/ldai_optimization/util.py @@ -80,10 +80,9 @@ async def await_if_needed( :param result: Either a string or an awaitable that returns a string :return: The string result """ - if inspect.iscoroutine(result): - return await result - else: + if isinstance(result, str): return result + return await result def create_evaluation_tool() -> ToolDefinition: From c032aafd7ebd9384819498aab80fcf56f50eea65 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Tue, 31 Mar 2026 08:42:21 -0800 Subject: [PATCH 07/11] fix: sort imports --- packages/optimization/src/ldai_optimization/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index c8747fd..d263b60 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -1,13 +1,13 @@ """Client for LaunchDarkly AI agent optimization.""" -from typing import Any, Dict, 
List, Literal, Optional import dataclasses -import os +import json import logging +import os import random -import json +from typing import Any, Dict, List, Literal, Optional -from ldai import LDAIClient, AIJudgeConfig, AIJudgeConfigDefault, AIAgentConfig +from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient from ldai.models import LDMessage, ModelConfig from ldclient import Context From aee6aa744468e0be20d2874dfe5701a3583a8050 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Tue, 31 Mar 2026 08:47:10 -0800 Subject: [PATCH 08/11] fix: lint --- .../src/ldai_optimization/client.py | 377 +++++++++++------- 1 file changed, 227 insertions(+), 150 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index d263b60..0f97d63 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -131,7 +131,15 @@ def _create_optimization_context( def _safe_status_update( self, - status: Literal["init", "generating", "evaluating", "generating variation", "turn completed", "success", "failure"], + status: Literal[ + "init", + "generating", + "evaluating", + "generating variation", + "turn completed", + "success", + "failure", + ], context: OptimizationContext, iteration: int, ) -> None: @@ -305,7 +313,9 @@ def _builtin_agent_tool_handlers(self, is_variation: bool) -> Dict[str, Any]: """ if is_variation: return { - create_variation_tool(self._options.model_choices).name: handle_variation_tool_call, + create_variation_tool( + self._options.model_choices + ).name: handle_variation_tool_call, } return {} @@ -343,11 +353,19 @@ async def _call_judges( judge_results: Dict[str, JudgeResult] = {} judge_count = len(self._options.judges) - for idx, (judge_key, optimization_judge) in enumerate(self._options.judges.items(), 1): - judge_type = "config" if optimization_judge.judge_key is not None else "acceptance" + for 
idx, (judge_key, optimization_judge) in enumerate( + self._options.judges.items(), 1 + ): + judge_type = ( + "config" if optimization_judge.judge_key is not None else "acceptance" + ) logger.info( "[Iteration %d] -> Running judge %d/%d '%s' (%s)...", - iteration, idx, judge_count, judge_key, judge_type, + iteration, + idx, + judge_count, + judge_key, + judge_type, ) try: if optimization_judge.judge_key is not None: @@ -375,7 +393,11 @@ async def _call_judges( ) judge_results[judge_key] = result - threshold = optimization_judge.threshold if optimization_judge.threshold is not None else 1.0 + threshold = ( + optimization_judge.threshold + if optimization_judge.threshold is not None + else 1.0 + ) passed = result.score >= threshold logger.debug( "[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s", @@ -479,7 +501,11 @@ async def _evaluate_config_judge( user_parts.append(msg.content) instructions = "\n\n".join(system_parts) - judge_user_input = "\n\n".join(user_parts) if user_parts else f"Here is the response to evaluate: {completion_response}" + judge_user_input = ( + "\n\n".join(user_parts) + if user_parts + else f"Here is the response to evaluate: {completion_response}" + ) # Rebuild the message list with the updated system content so completions users # receive the same scoring instructions that are baked into `instructions`. 
@@ -489,13 +515,19 @@ async def _evaluate_config_judge( ] # Collect model parameters from the judge config, separating out any existing tools - model_name = judge_config.model.name if judge_config.model else self._options.judge_model + model_name = ( + judge_config.model.name if judge_config.model else self._options.judge_model + ) model_params: Dict[str, Any] = {} tools: List[ToolDefinition] = [] if judge_config.model and judge_config.model._parameters: existing_tools = judge_config.model._parameters.get("tools") if existing_tools: - raw = existing_tools if isinstance(existing_tools, list) else [existing_tools] + raw = ( + existing_tools + if isinstance(existing_tools, list) + else [existing_tools] + ) for t in raw: if isinstance(t, ToolDefinition): tools.append(t) @@ -503,7 +535,9 @@ async def _evaluate_config_judge( tools.append(ToolDefinition.from_dict(t.to_dict())) elif isinstance(t, dict): tools.append(ToolDefinition.from_dict(t)) - model_params = {k: v for k, v in judge_config.model._parameters.items() if k != "tools"} + model_params = { + k: v for k, v in judge_config.model._parameters.items() if k != "tools" + } # Prepend agent tools so the judge can call them when verifying the response if agent_tools: @@ -615,7 +649,9 @@ async def _evaluate_acceptance_judge( ) # Prepend agent tools so the judge can invoke them for verification if needed - tools: List[ToolDefinition] = list(resolved_agent_tools) + [create_evaluation_tool()] + tools: List[ToolDefinition] = list(resolved_agent_tools) + [ + create_evaluation_tool() + ] judge_user_input = f"Here is the response to evaluate: {completion_response}" @@ -643,7 +679,10 @@ async def _evaluate_acceptance_judge( judge_response = await await_if_needed(result) logger.debug( - "[Iteration %d] -> Judge response (%s): %s", iteration, judge_key, judge_response + "[Iteration %d] -> Judge response (%s): %s", + iteration, + judge_key, + judge_response, ) # Parse judge response — expect structured JSON output with score and 
rationale @@ -651,7 +690,9 @@ async def _evaluate_acceptance_judge( judge_response, judge_key, judge_key, iteration, clamp_score=True ) - async def _get_agent_config(self, agent_key: str, context: Context) -> AIAgentConfig: + async def _get_agent_config( + self, agent_key: str, context: Context + ) -> AIAgentConfig: """ Fetch the agent configuration, replacing the instructions with the raw variation template so that {{placeholder}} tokens are preserved for client-side interpolation. @@ -670,10 +711,14 @@ async def _get_agent_config(self, agent_key: str, context: Context) -> AIAgentCo # variation() returns the raw JSON before chevron.render(), so instructions # still contain {{placeholder}} tokens rather than empty strings. raw_variation = self._ldClient._client.variation(agent_key, context, {}) - raw_instructions = raw_variation.get("instructions", agent_config.instructions) + raw_instructions = raw_variation.get( + "instructions", agent_config.instructions + ) self._initial_instructions = raw_instructions - agent_config = dataclasses.replace(agent_config, instructions=raw_instructions) + agent_config = dataclasses.replace( + agent_config, instructions=raw_instructions + ) self._initialize_class_members_from_config(agent_config) return agent_config except Exception: @@ -715,9 +760,7 @@ def _build_reasoning_history(self) -> str: return "\n".join(reasoning_parts) - def _build_new_variation_prompt( - self, history: List[OptimizationContext] - ) -> str: + def _build_new_variation_prompt(self, history: List[OptimizationContext]) -> str: """ Build the LLM prompt for generating an improved agent configuration. 
@@ -742,16 +785,18 @@ def _build_new_variation_prompt( def _new_variation_prompt_preamble(self) -> str: """Static opening section for the variation generation prompt.""" - return "\n".join([ - "You are an assistant that helps improve agent configurations through iterative optimization.", - "", - "Your task is to generate improved agent instructions and parameters based on the feedback provided.", - "The feedback you provide should guide the LLM To improve the agent instructions for all possible use cases, not one concrete case.", - "For example, if the feedback is that the agent is not returning the correct records, you should improve the agent instructions to return the correct records for all possible use cases. Not just the one concrete case that was provided in the feedback.", - "When changing the instructions, keep the original intent in mind when it comes to things like the use of variables and placeholders.", - "If the original instructions were to use a placeholder like {{id}}, you should keep the placeholder in the new instructions, not replace it with the actual value. This is the case for all parameterized values (all parameters should appear in each new variation).", - "Pay particular attention to the instructions regarding tools and the rules for variables." - ]) + return "\n".join( + [ + "You are an assistant that helps improve agent configurations through iterative optimization.", + "", + "Your task is to generate improved agent instructions and parameters based on the feedback provided.", + "The feedback you provide should guide the LLM To improve the agent instructions for all possible use cases, not one concrete case.", + "For example, if the feedback is that the agent is not returning the correct records, you should improve the agent instructions to return the correct records for all possible use cases. 
Not just the one concrete case that was provided in the feedback.", + "When changing the instructions, keep the original intent in mind when it comes to things like the use of variables and placeholders.", + "If the original instructions were to use a placeholder like {{id}}, you should keep the placeholder in the new instructions, not replace it with the actual value. This is the case for all parameterized values (all parameters should appear in each new variation).", + "Pay particular attention to the instructions regarding tools and the rules for variables.", + ] + ) def _new_variation_prompt_acceptance_criteria(self) -> str: """ @@ -816,16 +861,16 @@ def _new_variation_prompt_configuration( lines.append(f"Agent response: {previous_ctx.completion_response}") return "\n".join(lines) else: - return "\n".join([ - "## Current Configuration:", - f"Model: {self._current_model}", - f"Instructions: {self._current_instructions}", - f"Parameters: {self._current_parameters}", - ]) - - def _new_variation_prompt_feedback( - self, history: List[OptimizationContext] - ) -> str: + return "\n".join( + [ + "## Current Configuration:", + f"Model: {self._current_model}", + f"Instructions: {self._current_instructions}", + f"Parameters: {self._current_parameters}", + ] + ) + + def _new_variation_prompt_feedback(self, history: List[OptimizationContext]) -> str: """ Evaluation feedback section of the variation prompt. @@ -874,11 +919,13 @@ def _new_variation_prompt_improvement_instructions( output format schema. When history is non-empty, adds feedback-driven improvement directives. """ - model_instructions = "\n".join([ - "You may also choose to change the model if you believe that the current model is not performing well or a different model would be better suited for the task. " - f"Here are the models you may choose from: {self._options.model_choices}. 
You must always return a model property, even if it's the same as the current model.", - "When suggesting a new model, you should provide a rationale for why you believe the new model would be better suited for the task.", - ]) + model_instructions = "\n".join( + [ + "You may also choose to change the model if you believe that the current model is not performing well or a different model would be better suited for the task. " + f"Here are the models you may choose from: {self._options.model_choices}. You must always return a model property, even if it's the same as the current model.", + "When suggesting a new model, you should provide a rationale for why you believe the new model would be better suited for the task.", + ] + ) # Collect unique variable keys across all variable_choices entries variable_keys: set = set() @@ -886,107 +933,116 @@ def _new_variation_prompt_improvement_instructions( variable_keys.update(choice.keys()) placeholder_list = ", ".join(f"{{{{{k}}}}}" for k in sorted(variable_keys)) - variable_instructions = "\n".join([ - "## Prompt Variables:", - "These variables are substituted into the instructions at call time using {{variable_name}} syntax.", - "Rules:", - "- If the {{variable_name}} placeholder is not present in the current instructions, you should include it where logically appropriate.", - "Here are the original instructions so that you can see how the placeholders are used and which are available:", - "\nSTART:" - "\n" + self._initial_instructions + "\n", - "\nEND OF ORIGINAL INSTRUCTIONS\n", - f"The following prompt variables are available and are the only variables that should be used: {placeholder_list}" - "Here is an example of a good response if an {{id}} placeholder is available: 'Select records matching id {{id}}'", - "Here is an example of a bad response if an {{id}} placeholder is available: 'Select records matching id 1232'", - "Here is an example of a good response if a {{resource_id}} and {{resource_type}} placeholder are 
available: 'Select records matching id {{resource_id}} and type {{resource_type}}'", - "Here is an example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id 1232 and type {{resource_type}}'", - "Here is another example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type resource-123'", - ]) - - tool_instructions = "\n".join([ - "## Tool Format:", - "If the current configuration includes tools, you MUST return them unchanged in current_parameters[\"tools\"].", - "Do NOT include internal framework tools such as the evaluation tool or structured output tool.", - "Each tool must follow this exact format:", - "{", - ' "name": "tool-name",', - ' "type": "function",', - ' "description": "What the tool does",', - ' "parameters": {', - ' "type": "object",', - ' "properties": {', - ' "param_name": {', - ' "type": "type of the input parameter",', - ' "description": "Description of the parameter"', - " }", - " },", - ' "required": ["param_name"],', - ' "additionalProperties": false', - " }", - "}", - "Example:", - "{", - ' "name": "user-preferences-lookup",', - ' "type": "function",', - ' "description": "Looks up user preferences by ID",', - ' "parameters": {', - ' "type": "object",', - ' "properties": {', - ' "user_id": {', - ' "type": "string",', - ' "description": "The user id"', - " }", - " },", - ' "required": ["user_id"],', - ' "additionalProperties": false', - " }", - "}", - ]) - - parameters_instructions = "\n".join([ - "Return these values in a JSON object with the following keys: current_instructions, current_parameters, and model.", - "Example:", - "{", - ' "current_instructions": "...', - ' "current_parameters": {', - ' "...": "..."', - " },", - ' "model": "gpt-4o"', - "}", - "Parameters should only be things that are directly parseable by an LLM call, for example, temperature, max_tokens, etc." 
- "Do not include any other parameters that are not directly parseable by an LLM call. If you want to provide instruction for tone or other attributes, provide them directly in the instructions.", - ]) + variable_instructions = "\n".join( + [ + "## Prompt Variables:", + "These variables are substituted into the instructions at call time using {{variable_name}} syntax.", + "Rules:", + "- If the {{variable_name}} placeholder is not present in the current instructions, you should include it where logically appropriate.", + "Here are the original instructions so that you can see how the placeholders are used and which are available:", + "\nSTART:" "\n" + self._initial_instructions + "\n", + "\nEND OF ORIGINAL INSTRUCTIONS\n", + f"The following prompt variables are available and are the only variables that should be used: {placeholder_list}" + "Here is an example of a good response if an {{id}} placeholder is available: 'Select records matching id {{id}}'", + "Here is an example of a bad response if an {{id}} placeholder is available: 'Select records matching id 1232'", + "Here is an example of a good response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type {{resource_type}}'", + "Here is an example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id 1232 and type {{resource_type}}'", + "Here is another example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type resource-123'", + ] + ) + + tool_instructions = "\n".join( + [ + "## Tool Format:", + 'If the current configuration includes tools, you MUST return them unchanged in current_parameters["tools"].', + "Do NOT include internal framework tools such as the evaluation tool or structured output tool.", + "Each tool must follow this exact format:", + "{", + ' "name": "tool-name",', + ' "type": 
"function",', + ' "description": "What the tool does",', + ' "parameters": {', + ' "type": "object",', + ' "properties": {', + ' "param_name": {', + ' "type": "type of the input parameter",', + ' "description": "Description of the parameter"', + " }", + " },", + ' "required": ["param_name"],', + ' "additionalProperties": false', + " }", + "}", + "Example:", + "{", + ' "name": "user-preferences-lookup",', + ' "type": "function",', + ' "description": "Looks up user preferences by ID",', + ' "parameters": {', + ' "type": "object",', + ' "properties": {', + ' "user_id": {', + ' "type": "string",', + ' "description": "The user id"', + " }", + " },", + ' "required": ["user_id"],', + ' "additionalProperties": false', + " }", + "}", + ] + ) + + parameters_instructions = "\n".join( + [ + "Return these values in a JSON object with the following keys: current_instructions, current_parameters, and model.", + "Example:", + "{", + ' "current_instructions": "...', + ' "current_parameters": {', + ' "...": "..."', + " },", + ' "model": "gpt-4o"', + "}", + "Parameters should only be things that are directly parseable by an LLM call, for example, temperature, max_tokens, etc." + "Do not include any other parameters that are not directly parseable by an LLM call. If you want to provide instruction for tone or other attributes, provide them directly in the instructions.", + ] + ) if history: - return "\n".join([ - "## Improvement Instructions:", - "Based on the evaluation history above, generate improved agent instructions and parameters.", - "Focus on addressing the areas where the evaluation failed or scored below threshold.", - "The new configuration should aim to improve the agent's performance on the evaluation criteria.", - model_instructions, - "", - variable_instructions, - "", - tool_instructions, - "", - "Return the improved configuration in a structured format that can be parsed to update:", - "1. The agent instructions (current_instructions)", - "2. 
The agent parameters (current_parameters)", - "3. The model (model) - you must always return a model, even if it's the same as the current model.", - "4. You should return the tools the user has defined, as-is, on the new parameters. Do not modify them, but make sure you do not include internal tools like the evaluation tool or structured output tool.", - parameters_instructions, - ]) + return "\n".join( + [ + "## Improvement Instructions:", + "Based on the evaluation history above, generate improved agent instructions and parameters.", + "Focus on addressing the areas where the evaluation failed or scored below threshold.", + "The new configuration should aim to improve the agent's performance on the evaluation criteria.", + model_instructions, + "", + variable_instructions, + "", + tool_instructions, + "", + "Return the improved configuration in a structured format that can be parsed to update:", + "1. The agent instructions (current_instructions)", + "2. The agent parameters (current_parameters)", + "3. The model (model) - you must always return a model, even if it's the same as the current model.", + "4. You should return the tools the user has defined, as-is, on the new parameters. 
Do not modify them, but make sure you do not include internal tools like the evaluation tool or structured output tool.", + parameters_instructions, + ] + ) else: - return "\n".join([ - "Generate an improved version of this configuration.", - model_instructions, - "", - variable_instructions, - "", - tool_instructions, - "", - parameters_instructions, - ]) + return "\n".join( + [ + "Generate an improved version of this configuration.", + model_instructions, + "", + variable_instructions, + "", + tool_instructions, + "", + parameters_instructions, + ] + ) def _apply_new_variation_response( self, @@ -1322,7 +1378,10 @@ async def _run_optimization( iteration += 1 logger.info( "[Iteration %d] -> Starting (attempt %d/%d, model=%s)", - iteration, iteration, self._options.max_attempts, self._current_model, + iteration, + iteration, + self._options.max_attempts, + self._current_model, ) user_input = None if self._options.user_input_options: @@ -1338,24 +1397,33 @@ async def _run_optimization( ) self._safe_status_update("generating", optimize_context, iteration) - optimize_context = await self._execute_agent_turn(optimize_context, iteration) + optimize_context = await self._execute_agent_turn( + optimize_context, iteration + ) # Manual path: on_turn callback gives caller full control over pass/fail if self._options.on_turn is not None: try: on_turn_result = self._options.on_turn(optimize_context) if on_turn_result: - logger.info("[Iteration %d] -> on_turn returned True — turn passed", iteration) + logger.info( + "[Iteration %d] -> on_turn returned True — turn passed", + iteration, + ) return self._handle_success(optimize_context, iteration) else: logger.info( "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)", - iteration, iteration, self._options.max_attempts, + iteration, + iteration, + self._options.max_attempts, ) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) 
self._history.append(optimize_context) - await self._generate_new_variation(iteration, optimize_context.current_variables) + await self._generate_new_variation( + iteration, optimize_context.current_variables + ) self._safe_status_update( "turn completed", optimize_context, iteration ) @@ -1367,7 +1435,9 @@ async def _run_optimization( if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) self._history.append(optimize_context) - await self._generate_new_variation(iteration, optimize_context.current_variables) + await self._generate_new_variation( + iteration, optimize_context.current_variables + ) self._safe_status_update( "turn completed", optimize_context, iteration ) @@ -1376,17 +1446,24 @@ async def _run_optimization( # Auto-path: judge scores determine pass/fail via _evaluate_response passes = self._evaluate_response(optimize_context) if passes: - logger.info("[Iteration %d] -> All judges passed — turn succeeded", iteration) + logger.info( + "[Iteration %d] -> All judges passed — turn succeeded", + iteration, + ) return self._handle_success(optimize_context, iteration) else: logger.info( "[Iteration %d] -> One or more judges failed (attempt %d/%d) — generating new variation", - iteration, iteration, self._options.max_attempts, + iteration, + iteration, + self._options.max_attempts, ) if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) self._history.append(optimize_context) - await self._generate_new_variation(iteration, optimize_context.current_variables) + await self._generate_new_variation( + iteration, optimize_context.current_variables + ) self._safe_status_update( "turn completed", optimize_context, iteration ) From 59c7ac74d9088fcdc3f825fb8fb09eb9ee520d32 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Tue, 31 Mar 2026 09:37:56 -0800 Subject: [PATCH 09/11] chore: break up long lines, add spaces where necessary --- .../src/ldai_optimization/client.py | 106 
++++++++++++------ 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index 0f97d63..f29e0cd 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -625,16 +625,19 @@ async def _evaluate_acceptance_judge( # Build instructions for the judge instructions = ( - f"You are a judge that evaluates the response to the user's question.\n\n" - f"Here is the statement that you should evaluate the response against: '{optimization_judge.acceptance_statement}'\n" + "You are a judge that evaluates the response to the user's question.\n\n" + "Here is the statement that you should evaluate the response against: " + f"'{optimization_judge.acceptance_statement}'\n" f"Here is the history of all messages between the user and the assistant: {message_history_text}\n" - f"You should score the response based on how well it meets the acceptance statement using a score between 0.0 and 1.0.\n" - f"A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly.\n" - f"A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well.\n" - f"A score of 0.0-0.3 means that it does not match well at all. 
You can return any value between 0.0 and 1.0.\n" - f"You should also provide a rationale for your score.\n" - f"You should call the structured output tool to format your response.\n\n" - f'Example: {{"score": 0.8, "rationale": "The response matches the acceptance statement well."}}' + "You should score the response based on how well it meets the acceptance statement " + "using a score between 0.0 and 1.0.\n" + "A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly.\n" + "A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well.\n" + "A score of 0.0-0.3 means that it does not match well at all. " + "You can return any value between 0.0 and 1.0.\n" + "You should also provide a rationale for your score.\n" + "You should call the structured output tool to format your response.\n\n" + 'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}' ) if resolved_variables: @@ -643,9 +646,13 @@ async def _evaluate_acceptance_judge( if resolved_agent_tools: tool_names = [t.name for t in resolved_agent_tools] instructions += ( - f"\n\nThe following tools were available to the agent and may be called by you to verify the response: {json.dumps(tool_names)}." - "\nIf verifying the response requires looking up external information, call the appropriate tool before scoring." - "You should only call the tools for the most recent response, and should only call the tools if necessary. Assume that previous feedback will have addressed bad tool call results from prior iterations." + "\n\nThe following tools were available to the agent and " + f"may be called by you to verify the response: {json.dumps(tool_names)}." + "\nIf verifying the response requires looking up external information, " + "call the appropriate tool before scoring. " + "You should only call the tools for the most recent response, " + "and should only call the tools if necessary. 
" + "Assume that previous feedback will have addressed bad tool call results from prior iterations." ) # Prepend agent tools so the judge can invoke them for verification if needed @@ -790,10 +797,16 @@ def _new_variation_prompt_preamble(self) -> str: "You are an assistant that helps improve agent configurations through iterative optimization.", "", "Your task is to generate improved agent instructions and parameters based on the feedback provided.", - "The feedback you provide should guide the LLM To improve the agent instructions for all possible use cases, not one concrete case.", - "For example, if the feedback is that the agent is not returning the correct records, you should improve the agent instructions to return the correct records for all possible use cases. Not just the one concrete case that was provided in the feedback.", - "When changing the instructions, keep the original intent in mind when it comes to things like the use of variables and placeholders.", - "If the original instructions were to use a placeholder like {{id}}, you should keep the placeholder in the new instructions, not replace it with the actual value. This is the case for all parameterized values (all parameters should appear in each new variation).", + "The feedback you provide should guide the LLM to improve the agent instructions " + "for all possible use cases, not one concrete case.", + "For example, if the feedback is that the agent is not returning the correct records, " + "you should improve the agent instructions to return the correct records for all possible use cases. " + "Not just the one concrete case that was provided in the feedback.", + "When changing the instructions, keep the original intent in mind " + "when it comes to things like the use of variables and placeholders.", + "If the original instructions were to use a placeholder like {{id}}, " + "you should keep the placeholder in the new instructions, not replace it with the actual value. 
" + "This is the case for all parameterized values (all parameters should appear in each new variation).", "Pay particular attention to the instructions regarding tools and the rules for variables.", ] ) @@ -899,7 +912,10 @@ def _new_variation_prompt_feedback(self, history: List[OptimizationContext]) -> if optimization_judge.threshold is not None: passed = score >= optimization_judge.threshold status = "PASSED" if passed else "FAILED" - feedback_line = f"- {judge_key}: Score {score:.3f} (threshold: {optimization_judge.threshold}) - {status}" + feedback_line = ( + f"- {judge_key}: Score {score:.3f}" + f" (threshold: {optimization_judge.threshold}) - {status}" + ) else: passed = score >= 1.0 status = "PASSED" if passed else "FAILED" @@ -921,9 +937,12 @@ def _new_variation_prompt_improvement_instructions( """ model_instructions = "\n".join( [ - "You may also choose to change the model if you believe that the current model is not performing well or a different model would be better suited for the task. " - f"Here are the models you may choose from: {self._options.model_choices}. You must always return a model property, even if it's the same as the current model.", - "When suggesting a new model, you should provide a rationale for why you believe the new model would be better suited for the task.", + "You may also choose to change the model if you believe that the current model is " + "not performing well or a different model would be better suited for the task. " + f"Here are the models you may choose from: {self._options.model_choices}. 
" + "You must always return a model property, even if it's the same as the current model.", + "When suggesting a new model, you should provide a rationale for why you believe " + "the new model would be better suited for the task.", ] ) @@ -938,23 +957,35 @@ def _new_variation_prompt_improvement_instructions( "## Prompt Variables:", "These variables are substituted into the instructions at call time using {{variable_name}} syntax.", "Rules:", - "- If the {{variable_name}} placeholder is not present in the current instructions, you should include it where logically appropriate.", - "Here are the original instructions so that you can see how the placeholders are used and which are available:", + "- If the {{variable_name}} placeholder is not present in the current instructions, " + "you should include it where logically appropriate.", + "Here are the original instructions so that you can see how the " + "placeholders are used and which are available:", "\nSTART:" "\n" + self._initial_instructions + "\n", "\nEND OF ORIGINAL INSTRUCTIONS\n", - f"The following prompt variables are available and are the only variables that should be used: {placeholder_list}" - "Here is an example of a good response if an {{id}} placeholder is available: 'Select records matching id {{id}}'", - "Here is an example of a bad response if an {{id}} placeholder is available: 'Select records matching id 1232'", - "Here is an example of a good response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type {{resource_type}}'", - "Here is an example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id 1232 and type {{resource_type}}'", - "Here is another example of a bad response if a {{resource_id}} and {{resource_type}} placeholder are available: 'Select records matching id {{resource_id}} and type resource-123'", + f"The following prompt variables are available and 
are the only " + f"variables that should be used: {placeholder_list}", + "Here is an example of a good response if an {{id}} placeholder is available: " + "'Select records matching id {{id}}'", + "Here is an example of a bad response if an {{id}} placeholder is available: " + "'Select records matching id 1232'", + "Here is an example of a good response if a {{resource_id}} and {{resource_type}} " + "placeholder are available: " + "'Select records matching id {{resource_id}} and type {{resource_type}}'", + "Here is an example of a bad response if a {{resource_id}} and {{resource_type}} " + "placeholder are available: " + "'Select records matching id 1232 and type {{resource_type}}'", + "Here is another example of a bad response if a {{resource_id}} and {{resource_type}} " + "placeholder are available: " + "'Select records matching id {{resource_id}} and type resource-123'", ] ) tool_instructions = "\n".join( [ "## Tool Format:", - 'If the current configuration includes tools, you MUST return them unchanged in current_parameters["tools"].', + 'If the current configuration includes tools, you MUST return them ' + 'unchanged in current_parameters["tools"].', "Do NOT include internal framework tools such as the evaluation tool or structured output tool.", "Each tool must follow this exact format:", "{", @@ -995,7 +1026,8 @@ def _new_variation_prompt_improvement_instructions( parameters_instructions = "\n".join( [ - "Return these values in a JSON object with the following keys: current_instructions, current_parameters, and model.", + "Return these values in a JSON object with the following keys: " + "current_instructions, current_parameters, and model.", "Example:", "{", ' "current_instructions": "...', @@ -1004,8 +1036,11 @@ def _new_variation_prompt_improvement_instructions( " },", ' "model": "gpt-4o"', "}", - "Parameters should only be things that are directly parseable by an LLM call, for example, temperature, max_tokens, etc." 
- "Do not include any other parameters that are not directly parseable by an LLM call. If you want to provide instruction for tone or other attributes, provide them directly in the instructions.", + "Parameters should only be things that are directly parseable by an LLM call, " + "for example, temperature, max_tokens, etc.", + "Do not include any other parameters that are not directly parseable by an LLM call. " + "If you want to provide instruction for tone or other attributes, " + "provide them directly in the instructions.", ] ) @@ -1025,8 +1060,11 @@ def _new_variation_prompt_improvement_instructions( "Return the improved configuration in a structured format that can be parsed to update:", "1. The agent instructions (current_instructions)", "2. The agent parameters (current_parameters)", - "3. The model (model) - you must always return a model, even if it's the same as the current model.", - "4. You should return the tools the user has defined, as-is, on the new parameters. Do not modify them, but make sure you do not include internal tools like the evaluation tool or structured output tool.", + "3. The model (model) - you must always return a model, " + "even if it's the same as the current model.", + "4. You should return the tools the user has defined, as-is, on the new parameters. 
" + "Do not modify them, but make sure you do not include internal tools like " + "the evaluation tool or structured output tool.", parameters_instructions, ] ) From 59f03f2ab3337e2dc7be2e72df6fe585f7155946 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Tue, 31 Mar 2026 09:39:24 -0800 Subject: [PATCH 10/11] chore: break up another long line --- packages/optimization/src/ldai_optimization/dataclasses.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/optimization/src/ldai_optimization/dataclasses.py b/packages/optimization/src/ldai_optimization/dataclasses.py index b8f6e93..944f7ec 100644 --- a/packages/optimization/src/ldai_optimization/dataclasses.py +++ b/packages/optimization/src/ldai_optimization/dataclasses.py @@ -144,7 +144,8 @@ class OptimizationContext: completion_response: str current_instructions: str current_parameters: Dict[str, Any] - current_variables: Dict[str, Any] # variable set chosen for this iteration; interpolated into instructions at call time + # variable set chosen for this iteration; interpolated into instructions at call time + current_variables: Dict[str, Any] current_model: Optional[str] = None # the current model being used user_input: Optional[str] = None # the user input message for this iteration history: Sequence[OptimizationContext] = field( From e2ff5617572de05b6ce657e5da1a84e938a5bc34 Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Tue, 31 Mar 2026 09:47:02 -0800 Subject: [PATCH 11/11] chore: fix on_turn path --- .../src/ldai_optimization/client.py | 68 ++++++++++--------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py index f29e0cd..6ebd8bb 100644 --- a/packages/optimization/src/ldai_optimization/client.py +++ b/packages/optimization/src/ldai_optimization/client.py @@ -1443,43 +1443,39 @@ async def _run_optimization( if self._options.on_turn is not None: try: 
on_turn_result = self._options.on_turn(optimize_context) - if on_turn_result: - logger.info( - "[Iteration %d] -> on_turn returned True — turn passed", - iteration, - ) - return self._handle_success(optimize_context, iteration) - else: - logger.info( - "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)", - iteration, - iteration, - self._options.max_attempts, - ) - if iteration >= self._options.max_attempts: - return self._handle_failure(optimize_context, iteration) - self._history.append(optimize_context) - await self._generate_new_variation( - iteration, optimize_context.current_variables - ) - self._safe_status_update( - "turn completed", optimize_context, iteration - ) - continue - except Exception as e: + except Exception: logger.exception( "[Iteration %d] -> on_turn evaluation failed", iteration ) - if iteration >= self._options.max_attempts: - return self._handle_failure(optimize_context, iteration) - self._history.append(optimize_context) + on_turn_result = False + + if on_turn_result: + logger.info( + "[Iteration %d] -> on_turn returned True — turn passed", + iteration, + ) + return self._handle_success(optimize_context, iteration) + + logger.info( + "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)", + iteration, + iteration, + self._options.max_attempts, + ) + if iteration >= self._options.max_attempts: + return self._handle_failure(optimize_context, iteration) + self._history.append(optimize_context) + try: await self._generate_new_variation( iteration, optimize_context.current_variables ) - self._safe_status_update( - "turn completed", optimize_context, iteration + except Exception: + logger.exception( + "[Iteration %d] -> variation generation failed", iteration ) - continue + return self._handle_failure(optimize_context, iteration) + self._safe_status_update("turn completed", optimize_context, iteration) + continue else: # Auto-path: judge scores determine pass/fail via _evaluate_response passes = 
self._evaluate_response(optimize_context) @@ -1499,9 +1495,15 @@ async def _run_optimization( if iteration >= self._options.max_attempts: return self._handle_failure(optimize_context, iteration) self._history.append(optimize_context) - await self._generate_new_variation( - iteration, optimize_context.current_variables - ) + try: + await self._generate_new_variation( + iteration, optimize_context.current_variables + ) + except Exception: + logger.exception( + "[Iteration %d] -> variation generation failed", iteration + ) + return self._handle_failure(optimize_context, iteration) self._safe_status_update( "turn completed", optimize_context, iteration )