diff --git a/packages/optimization/src/ldai_optimizer/client.py b/packages/optimization/src/ldai_optimizer/client.py index 7c34231d..7a901660 100644 --- a/packages/optimization/src/ldai_optimizer/client.py +++ b/packages/optimization/src/ldai_optimizer/client.py @@ -20,6 +20,7 @@ import logging import os import random +import re import time import uuid from typing import Any, Dict, List, Literal, Optional, Tuple, Union @@ -49,8 +50,6 @@ LDApiClient, ) from ldai_optimizer.prompts import ( - _acceptance_criteria_implies_cost_optimization, - _acceptance_criteria_implies_duration_optimization, build_message_history_text, build_new_variation_prompt, build_reasoning_history, @@ -71,6 +70,15 @@ logger.addFilter(RedactionFilter()) +def _interpolate(template: str, variables: Dict[str, Any]) -> str: + """Replace {{key}} tokens with values from variables; unresolved tokens become empty string.""" + return re.sub( + r"\{\{(\w+)\}\}", + lambda m: str(variables.get(m.group(1), "")), + template, + ) + + def _find_model_config( model_name: str, configs: List[Dict[str, Any]] ) -> Optional[Dict[str, Any]]: @@ -404,18 +412,65 @@ def _judge_config( variables: Dict[str, Any], ) -> AIJudgeConfig: """ - Fetch a judge configuration from the LaunchDarkly client. + Fetch a judge configuration by evaluating the flag variation directly. - Thin wrapper around LDAIClient.judge_config so callers do not need a - direct reference to the client. + Bypasses LDAIClient.judge_config to avoid the reserved-variable warnings + for 'message_history' and 'response_to_evaluate'. Those variables are + interpolated here with their actual values instead of being neutralised + by the SDK. If the template contains only a system message, a user turn + is synthesised from the provided message_history and response_to_evaluate + so that _evaluate_config_judge always receives a complete conversation. :param judge_key: The key for the judge configuration in LaunchDarkly :param context: The evaluation context - :param default: Fallback config when the flag is disabled or unreachable - :param variables: Template variables for instruction interpolation + :param default: Unused; kept for signature compatibility + :param variables: Template variables including message_history and response_to_evaluate :return: The resolved AIJudgeConfig """ - return self._ldClient.judge_config(judge_key, context, default, variables) + variation: Dict[str, Any] = self._ldClient._client.variation(judge_key, context, {}) + enabled: bool = bool(variation.get("_ldMeta", {}).get("enabled", False)) + + all_variables: Dict[str, Any] = {"ldctx": context.to_dict(), **variables} + + messages: List[LDMessage] = [] + raw_messages = variation.get("messages") + if isinstance(raw_messages, list) and all(isinstance(m, dict) for m in raw_messages): + messages = [ + LDMessage( + role=m["role"], + content=_interpolate(m.get("content", ""), all_variables), + ) + for m in raw_messages + ] + + # New-style templates only have a system message. Auto-generate a user + # turn so _evaluate_config_judge always has a complete conversation to split. + if not any(m.role == "user" for m in messages): + message_history = variables.get("message_history", "") + response_to_evaluate = variables.get("response_to_evaluate", "") + parts: List[str] = [] + if message_history: + parts.append(str(message_history)) + parts.append(f"Here is the response to evaluate: {response_to_evaluate}") + messages.append(LDMessage(role="user", content="\n\n".join(parts))) + + model: Optional[ModelConfig] = None + raw_model = variation.get("model") + if isinstance(raw_model, dict): + model = ModelConfig( + name=raw_model.get("name", ""), + parameters=raw_model.get("parameters"), + custom=raw_model.get("custom"), + ) + + return AIJudgeConfig( + key=judge_key, + enabled=enabled, + create_tracker=lambda: None, + model=model, + messages=messages, + evaluation_metric_key=variation.get("evaluationMetricKey"), + ) def _serialize_scores( self, judge_results: Dict[str, JudgeResult] @@ -850,9 +905,7 @@ async def _evaluate_acceptance_judge( if ( agent_duration_ms is not None - and _acceptance_criteria_implies_duration_optimization( - {judge_key: optimization_judge} - ) + and bool(self._options.latency_optimization) ): baseline_ms = self._baseline_duration_ms instructions += ( @@ -875,7 +928,7 @@ async def _evaluate_acceptance_judge( "These suggestions will be used directly to generate the next variation." ) - if _acceptance_criteria_implies_cost_optimization({judge_key: optimization_judge}): + if bool(self._options.token_optimization): current_cost = estimate_cost( agent_usage, _find_model_config(self._current_model or "", self._model_configs), @@ -975,7 +1028,12 @@ async def _evaluate_acceptance_judge( return dataclasses.replace(judge_result, duration_ms=judge_duration_ms, usage=judge_response.usage) async def _get_agent_config( - self, agent_key: str, context: Context + self, + agent_key: str, + context: Context, + variation_key: Optional[str] = None, + project_key: Optional[str] = None, + base_url: Optional[str] = None, ) -> AIAgentConfig: """ Fetch the agent configuration, replacing the instructions with the raw variation @@ -985,16 +1043,39 @@ async def _get_agent_config( (including the tracker). We then call variation() separately to retrieve the unrendered instruction template and swap it in, keeping everything else intact. + When ``variation_key`` is provided the specific variation is fetched via the + LaunchDarkly REST API instead of using the SDK's default flag evaluation. + :param agent_key: The key for the agent to get the configuration for :param context: The evaluation context + :param variation_key: Optional specific variation key to use as the base + :param project_key: LaunchDarkly project key; required when variation_key is set + :param base_url: Optional API base URL override :return: AIAgentConfig with raw {{placeholder}} instruction templates intact """ try: agent_config = self._ldClient.agent_config(agent_key, context) - # variation() returns the raw JSON before chevron.render(), so instructions - # still contain {{placeholder}} tokens rather than empty strings. - raw_variation = self._ldClient._client.variation(agent_key, context, {}) + if variation_key: + assert self._api_key is not None + api_client = LDApiClient( + self._api_key, + **({"base_url": base_url} if base_url else {}), + ) + ai_config = api_client.get_ai_config(project_key, agent_key) + match = next( + (v for v in (ai_config or {}).get("variations", []) if v.get("key") == variation_key), + None, + ) + if match is None: + raise ValueError( + f"variation_key '{variation_key}' not found in agent config '{agent_key}'" + ) + raw_variation = match + else: + # variation() returns the raw JSON before chevron.render(), so instructions + # still contain {{placeholder}} tokens rather than empty strings. + raw_variation = self._ldClient._client.variation(agent_key, context, {}) raw_instructions = raw_variation.get( "instructions", agent_config.instructions ) @@ -1030,20 +1111,20 @@ def _fetch_model_configs( self, project_key: Optional[str], base_url: Optional[str], - judges: Optional[Dict[str, "OptimizationJudge"]], + token_optimization: Optional[bool], ) -> None: """Populate ``_model_configs`` from the LD API when credentials are available. When an API key and project key are both present, fetches the model pricing catalogue so that ``estimate_cost`` can produce USD figures and the cost gate can make meaningful comparisons. If either is absent, ``_model_configs`` is - reset to an empty list and a warning is emitted when cost judges are in use — - cost optimization will silently pass rather than blocking the run. + reset to an empty list and a warning is emitted when token_optimization is + enabled — cost data will be unavailable and the cost gate will pass unconditionally. :param project_key: LaunchDarkly project key, or None if not provided. :param base_url: Optional API base URL override. - :param judges: Judge map from the caller's options, used only to decide - whether a cost-related warning is appropriate. + :param token_optimization: Whether token/cost optimization is enabled; used only to + decide whether a cost-related warning is appropriate. """ self._model_configs = [] if self._has_api_key and project_key: @@ -1056,9 +1137,9 @@ def _fetch_model_configs( self._model_configs = api_client.get_model_configs(project_key) except Exception as exc: logger.debug("Could not pre-fetch model configs: %s", exc) - elif _acceptance_criteria_implies_cost_optimization(judges or {}): + elif token_optimization: logger.warning( - "Cost optimization requires LAUNCHDARKLY_API_KEY and project_key to be set; " + "Token optimization requires LAUNCHDARKLY_API_KEY and project_key to be set; " "cost data will not be available and the cost gate will pass unconditionally" ) @@ -1080,10 +1161,24 @@ async def optimize_from_options( raise ValueError( "auto_commit requires project_key to be set on OptimizationOptions" ) + if options.variation_key: + if not self._has_api_key: + raise ValueError( + "variation_key requires LAUNCHDARKLY_API_KEY to be set" + ) + if not options.project_key: + raise ValueError( + "variation_key requires project_key to be set on OptimizationOptions" + ) self._agent_key = agent_key - self._fetch_model_configs(options.project_key, options.base_url, options.judges) + self._fetch_model_configs(options.project_key, options.base_url, options.token_optimization) context = random.choice(options.context_choices) - agent_config = await self._get_agent_config(agent_key, context) + agent_config = await self._get_agent_config( + agent_key, context, + variation_key=options.variation_key, + project_key=options.project_key, + base_url=options.base_url, + ) result = await self._run_optimization(agent_config, options) if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context: self._commit_variation( @@ -1119,10 +1214,24 @@ async def optimize_from_ground_truth_options( raise ValueError( "auto_commit requires project_key to be set on GroundTruthOptimizationOptions" ) + if options.variation_key: + if not self._has_api_key: + raise ValueError( + "variation_key requires LAUNCHDARKLY_API_KEY to be set" + ) + if not options.project_key: + raise ValueError( + "variation_key requires project_key to be set on GroundTruthOptimizationOptions" + ) self._agent_key = agent_key - self._fetch_model_configs(options.project_key, options.base_url, options.judges) + self._fetch_model_configs(options.project_key, options.base_url, options.token_optimization) context = random.choice(options.context_choices) - agent_config = await self._get_agent_config(agent_key, context) + agent_config = await self._get_agent_config( + agent_key, context, + variation_key=options.variation_key, + project_key=options.project_key, + base_url=options.base_url, + ) result = await self._run_ground_truth_optimization(agent_config, options) if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context: self._commit_variation( @@ -1162,6 +1271,8 @@ async def _run_ground_truth_optimization( on_failing_result=gt_options.on_failing_result, on_status_update=gt_options.on_status_update, token_limit=gt_options.token_limit, + latency_optimization=gt_options.latency_optimization, + token_optimization=gt_options.token_optimization, ) self._options = bridge self._agent_config = agent_config @@ -1579,12 +1690,8 @@ async def _generate_new_variation( ) self._safe_status_update("generating variation", status_ctx, iteration) - optimize_for_duration = _acceptance_criteria_implies_duration_optimization( - self._options.judges - ) - optimize_for_cost = _acceptance_criteria_implies_cost_optimization( - self._options.judges - ) + optimize_for_duration = bool(self._options.latency_optimization) + optimize_for_cost = bool(self._options.token_optimization) quality_already_passing = self._all_judges_passing() instructions = build_new_variation_prompt( self._history, @@ -1708,7 +1815,7 @@ async def optimize_from_config( else: result = await self._run_optimization(agent_config, optimization_options) - if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context: + if optimization_options.auto_commit and options.auto_commit and self._last_run_succeeded and self._last_succeeded_context: created_key = self._commit_variation( self._last_succeeded_context, project_key=options.project_key, @@ -1989,6 +2096,9 @@ def _persist_and_forward( on_failing_result=options.on_failing_result, on_status_update=_persist_and_forward, token_limit=config.get("tokenLimit"), + latency_optimization=config.get("latencyOptimization"), + token_optimization=config.get("tokenOptimization"), + auto_commit=config.get("autoCommit", True), ) variable_choices: List[Dict[str, Any]] = config["variableChoices"] or [{}] @@ -2009,6 +2119,9 @@ def _persist_and_forward( on_failing_result=options.on_failing_result, on_status_update=_persist_and_forward, token_limit=config.get("tokenLimit"), + latency_optimization=config.get("latencyOptimization"), + token_optimization=config.get("tokenOptimization"), + auto_commit=config.get("autoCommit", True), ) async def _execute_agent_turn( @@ -2269,7 +2382,7 @@ def _apply_duration_gate( :param ctx: Current optimization context. :return: (passed, updated_ctx) where passed reflects gate outcome. """ - if not _acceptance_criteria_implies_duration_optimization(self._options.judges): + if not bool(self._options.latency_optimization): return passed_so_far, ctx passed = self._evaluate_duration(ctx) if passed: @@ -2323,7 +2436,7 @@ def _apply_cost_gate( :param ctx: Current optimization context. :return: (passed, updated_ctx) where passed reflects gate outcome. """ - if not _acceptance_criteria_implies_cost_optimization(self._options.judges): + if not bool(self._options.token_optimization): return passed_so_far, ctx passed = self._evaluate_cost(ctx) if passed: diff --git a/packages/optimization/src/ldai_optimizer/dataclasses.py b/packages/optimization/src/ldai_optimizer/dataclasses.py index eb206d90..2d45b909 100644 --- a/packages/optimization/src/ldai_optimizer/dataclasses.py +++ b/packages/optimization/src/ldai_optimizer/dataclasses.py @@ -348,9 +348,14 @@ class OptimizationOptions: context_choices: List[Context] = field( default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()] ) + # Base variation - Optional + variation_key: Optional[str] = None # use this specific variation as the base; defaults to the flag's default variation; requires API key + project_key + # Optimization controls - Optional; when None the corresponding gate/prompt is disabled + latency_optimization: Optional[bool] = None + token_optimization: Optional[bool] = None # Auto-commit - Optional auto_commit: bool = False - project_key: Optional[str] = None # required when auto_commit=True + project_key: Optional[str] = None # required when auto_commit=True or variation_key is set output_key: Optional[str] = None # variation key/name; auto-generated if omitted base_url: Optional[str] = None # override to target a non-default LD instance on_passing_result: Optional[Callable[[OptimizationContext], None]] = None @@ -440,9 +445,14 @@ class GroundTruthOptimizationOptions: context_choices: List[Context] = field( default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()] ) + # Base variation - Optional + variation_key: Optional[str] = None # use this specific variation as the base; defaults to the flag's default variation; requires API key + project_key + # Optimization controls - Optional; when None the corresponding gate/prompt is disabled + latency_optimization: Optional[bool] = None + token_optimization: Optional[bool] = None # Auto-commit - Optional auto_commit: bool = False - project_key: Optional[str] = None # required when auto_commit=True + project_key: Optional[str] = None # required when auto_commit=True or variation_key is set output_key: Optional[str] = None # variation key/name; auto-generated if omitted base_url: Optional[str] = None # override to target a non-default LD instance token_limit: Optional[int] = None # stop the run when total token usage reaches this value diff --git a/packages/optimization/src/ldai_optimizer/ld_api_client.py b/packages/optimization/src/ldai_optimizer/ld_api_client.py index 37f6549e..14843d90 100644 --- a/packages/optimization/src/ldai_optimizer/ld_api_client.py +++ b/packages/optimization/src/ldai_optimizer/ld_api_client.py @@ -90,6 +90,9 @@ class AgentOptimizationConfig(_AgentOptimizationConfigRequired, total=False): groundTruthResponses: List[str] metricKey: str tokenLimit: int + latencyOptimization: bool + tokenOptimization: bool + autoCommit: bool # --------------------------------------------------------------------------- diff --git a/packages/optimization/src/ldai_optimizer/prompts.py b/packages/optimization/src/ldai_optimizer/prompts.py index 9ba37d94..9e42ca49 100644 --- a/packages/optimization/src/ldai_optimizer/prompts.py +++ b/packages/optimization/src/ldai_optimizer/prompts.py @@ -1,6 +1,5 @@ """Prompt-building functions for LaunchDarkly AI optimization.""" -import re from typing import Any, Dict, List, Optional from ldai_optimizer.dataclasses import ( @@ -9,64 +8,6 @@ ) from ldai_optimizer.util import judge_passed -_DURATION_KEYWORDS = re.compile( - r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|" - r"time\s+to\s+respond|milliseconds|performant|snappy|efficient|seconds)\b|" - r"(? bool: - """Return True if any judge acceptance statement implies a latency optimization goal. - - Scans each judge's acceptance_statement for latency-related keywords. The - check is case-insensitive. Returns False when judges is None or no judge - carries an acceptance statement. - - :param judges: Judge configuration dict from OptimizationOptions, or None. - :return: True if duration optimization should be applied. - """ - if not judges: - return False - for judge in judges.values(): - if judge.acceptance_statement and _DURATION_KEYWORDS.search( - judge.acceptance_statement - ): - return True - return False - - -def _acceptance_criteria_implies_cost_optimization( - judges: Optional[Dict[str, OptimizationJudge]], -) -> bool: - """Return True if any judge acceptance statement implies a cost reduction goal. - - Scans each judge's acceptance_statement for cost-related keywords. The - check is case-insensitive. Returns False when judges is None or no judge - carries an acceptance statement. - - :param judges: Judge configuration dict from OptimizationOptions, or None. - :return: True if cost optimization should be applied. - """ - if not judges: - return False - for judge in judges.values(): - if judge.acceptance_statement and _COST_KEYWORDS.search( - judge.acceptance_statement - ): - return True - return False - def build_message_history_text( history: List[OptimizationContext], diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py index df9c6f6f..5a1d9895 100644 --- a/packages/optimization/tests/test_client.py +++ b/packages/optimization/tests/test_client.py @@ -6,7 +6,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest -from ldai import AIAgentConfig, AIJudgeConfig, LDAIClient +from ldai import AIAgentConfig, LDAIClient from ldai.client import Evaluator from ldai.models import LDMessage, ModelConfig from ldai.tracker import TokenUsage @@ -35,8 +35,6 @@ ToolDefinition, ) from ldai_optimizer.prompts import ( - _acceptance_criteria_implies_cost_optimization, - _acceptance_criteria_implies_duration_optimization, build_new_variation_prompt, variation_prompt_acceptance_criteria, variation_prompt_cost_optimization, @@ -561,13 +559,13 @@ async def test_variables_in_context(self): _, _, ctx, _ = call_args.args assert ctx.current_variables == variables - async def test_duration_context_added_to_instructions_when_latency_keyword_present(self): - """When acceptance statement has a latency keyword and agent_duration_ms is provided, - the instructions mention the duration.""" - judge = OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", + async def test_duration_context_added_when_latency_optimization_true_and_duration_provided(self): + """When latency_optimization=True and agent_duration_ms is provided, + the judge instructions mention the duration.""" + self.client._options = _make_options( + handle_judge_call=self.handle_judge_call, latency_optimization=True ) + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") await self.client._evaluate_acceptance_judge( judge_key="speed", optimization_judge=judge, @@ -583,6 +581,9 @@ async def test_duration_context_added_to_instructions_when_latency_keyword_prese async def test_duration_context_includes_baseline_comparison_when_history_present(self): """When a baseline duration is captured, the judge instructions include a baseline comparison.""" + self.client._options = _make_options( + handle_judge_call=self.handle_judge_call, latency_optimization=True + ) self.client._history = [ OptimizationContext( scores={}, @@ -595,10 +596,7 @@ async def test_duration_context_includes_baseline_comparison_when_history_presen ) ] self.client._baseline_duration_ms = 2000.0 - judge = OptimizationJudge( - threshold=0.8, - acceptance_statement="Responses should have low latency.", - ) + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") await self.client._evaluate_acceptance_judge( judge_key="latency", optimization_judge=judge, @@ -615,6 +613,9 @@ async def test_duration_context_includes_baseline_comparison_when_history_presen async def test_duration_context_says_slower_when_candidate_is_slower(self): """When the candidate is slower than baseline, the instructions say 'slower'.""" + self.client._options = _make_options( + handle_judge_call=self.handle_judge_call, latency_optimization=True + ) self.client._history = [ OptimizationContext( scores={}, @@ -627,10 +628,7 @@ async def test_duration_context_says_slower_when_candidate_is_slower(self): ) ] self.client._baseline_duration_ms = 1000.0 - judge = OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", - ) + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") await self.client._evaluate_acceptance_judge( judge_key="speed", optimization_judge=judge, @@ -643,12 +641,9 @@ async def test_duration_context_says_slower_when_candidate_is_slower(self): _, config, _, _ = self.handle_judge_call.call_args.args assert "slower" in config.instructions - async def test_duration_context_not_added_when_no_latency_keyword(self): - """When acceptance statement has no latency keyword, duration is not injected.""" - judge = OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate.", - ) + async def test_duration_context_not_added_when_latency_optimization_is_none(self): + """When latency_optimization is None (not set), duration is not injected.""" + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") await self.client._evaluate_acceptance_judge( judge_key="accuracy", optimization_judge=judge, @@ -660,14 +655,13 @@ async def test_duration_context_not_added_when_no_latency_keyword(self): ) _, config, _, _ = self.handle_judge_call.call_args.args assert "2000ms" not in config.instructions - assert "duration" not in config.instructions.lower() or "acceptance" in config.instructions.lower() async def test_duration_context_not_added_when_agent_duration_ms_is_none(self): - """When agent_duration_ms is None, no duration block is added even if keyword matches.""" - judge = OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", + """When agent_duration_ms is None, no duration block is added even if latency_optimization=True.""" + self.client._options = _make_options( + handle_judge_call=self.handle_judge_call, latency_optimization=True ) + judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.") await self.client._evaluate_acceptance_judge( judge_key="speed", optimization_judge=judge, @@ -723,20 +717,19 @@ def setup_method(self): self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)) self.client._options = _make_options(handle_judge_call=self.handle_judge_call) - def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig: - return AIJudgeConfig( - key="ld-judge-key", - enabled=enabled, - create_tracker=MagicMock, - model=ModelConfig(name="gpt-4o", parameters={}), - messages=[ - LDMessage(role="system", content="You are an evaluator."), - LDMessage(role="user", content="Evaluate this response."), + def _make_raw_variation(self, enabled: bool = True) -> Dict[str, Any]: + """Raw variation dict as returned by _client.variation for a judge flag.""" + return { + "_ldMeta": {"enabled": enabled}, + "messages": [ + {"role": "system", "content": "You are an evaluator."}, + {"role": "user", "content": "Evaluate this response."}, ], - ) + "model": {"name": "gpt-4o", "parameters": {}}, + } async def test_calls_handle_judge_call_with_correct_config_type(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -754,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self): assert isinstance(ctx, OptimizationJudgeContext) async def test_messages_has_system_and_user_turns(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -769,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self): assert roles == ["system", "user"] async def test_messages_system_content_matches_instructions(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -784,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self): assert system_msg.content == config.instructions async def test_messages_user_content_matches_context_user_input(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -799,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self): assert user_msg.content == ctx.user_input async def test_messages_user_content_contains_ld_user_message(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -814,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self): assert "Evaluate this response." in user_msg.content async def test_returns_zero_score_when_judge_disabled(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False) + self.mock_ldai._client.variation.return_value = self._make_raw_variation(enabled=False) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") result = await self.client._evaluate_config_judge( judge_key="quality", @@ -827,31 +820,37 @@ async def test_returns_zero_score_when_judge_disabled(self): assert result.score == 0.0 self.handle_judge_call.assert_not_called() - async def test_returns_zero_score_when_judge_has_no_messages(self): - judge_config = AIJudgeConfig( - key="ld-judge-key", - enabled=True, - create_tracker=MagicMock, - model=ModelConfig(name="gpt-4o", parameters={}), - messages=None, - ) - self.mock_ldai.judge_config.return_value = judge_config + async def test_system_only_template_auto_generates_user_message(self): + """When the flag template has only a system message, a user turn is synthesised.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [{"role": "system", "content": "You are an evaluator."}], + "model": {"name": "gpt-4o", "parameters": {}}, + } judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") - result = await self.client._evaluate_config_judge( + await self.client._evaluate_config_judge( judge_key="quality", optimization_judge=judge, - completion_response="Any.", + completion_response="The answer is 42.", iteration=1, reasoning_history="", - user_input="Anything?", + user_input="What is the answer?", ) - assert result.score == 0.0 - self.handle_judge_call.assert_not_called() - - async def test_template_variables_merged_into_judge_config_call(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + _, config, _, _ = self.handle_judge_call.call_args.args + user_msg = next(m for m in config.messages if m.role == "user") + assert "The answer is 42." in user_msg.content + + async def test_template_variables_interpolated_into_messages(self): + """Custom agent variables are interpolated into judge template messages.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [ + {"role": "system", "content": "Evaluate in {{language}}."}, + {"role": "user", "content": "Evaluate this response."}, + ], + "model": {"name": "gpt-4o", "parameters": {}}, + } judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") - variables = {"language": "Spanish"} await self.client._evaluate_config_judge( judge_key="quality", optimization_judge=judge, @@ -859,16 +858,38 @@ async def test_template_variables_merged_into_judge_config_call(self): iteration=1, reasoning_history="", user_input="Q?", - variables=variables, + variables={"language": "Spanish"}, + ) + _, config, _, _ = self.handle_judge_call.call_args.args + assert "Spanish" in config.instructions + + async def test_reserved_variables_interpolated_into_template_messages(self): + """message_history and response_to_evaluate are interpolated when present in the template.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [ + {"role": "system", "content": "History: {{message_history}}"}, + {"role": "user", "content": "Response: {{response_to_evaluate}}"}, + ], + "model": {"name": "gpt-4o", "parameters": {}}, + } + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="My answer.", + iteration=1, + reasoning_history="", + user_input="Q?", ) - call_kwargs = self.mock_ldai.judge_config.call_args - passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {}) - assert passed_vars.get("language") == "Spanish" - assert "message_history" in passed_vars - assert "response_to_evaluate" in passed_vars + _, config, _, _ = self.handle_judge_call.call_args.args + system_msg = next(m for m in config.messages if m.role == "system") + assert "History:" in system_msg.content + user_msg = next(m for m in config.messages if m.role == "user") + assert "My answer." in user_msg.content async def test_agent_tools_included_without_evaluation_tool(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() agent_tool = ToolDefinition(name="search", description="Search", input_schema={}) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( @@ -3862,132 +3883,6 @@ async def test_optimize_from_config_dispatches_to_gt_run(self): assert len(result) == 2 -# --------------------------------------------------------------------------- -# _acceptance_criteria_implies_duration_optimization -# --------------------------------------------------------------------------- - - -class TestAcceptanceCriteriaImpliesDurationOptimization: - def test_returns_false_when_judges_is_none(self): - assert _acceptance_criteria_implies_duration_optimization(None) is False - - def test_returns_false_when_judges_is_empty(self): - assert _acceptance_criteria_implies_duration_optimization({}) is False - - def test_returns_false_when_no_acceptance_statements(self): - judges = {"quality": OptimizationJudge(threshold=0.8, judge_key="judge-1")} - assert _acceptance_criteria_implies_duration_optimization(judges) is False - - def test_returns_false_when_acceptance_statement_has_no_latency_keywords(self): - judges = { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate and complete.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is False - - def test_detects_fast_keyword(self): - judges = { - "speed": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_faster_keyword(self): - judges = { - "speed": OptimizationJudge( - threshold=0.8, - acceptance_statement="The agent should respond faster.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_latency_keyword(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="The agent must have low latency.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_duration_keyword(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="Minimize the duration of each response.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_ms_keyword(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="Responses should complete in under 500ms.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_response_time_phrase(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response time should be minimized.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_efficient_keyword(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="The model must be efficient.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_detects_snappy_keyword(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="Responses should feel snappy.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_case_insensitive_match(self): - judges = { - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="The model must be EFFICIENT and FAST.", - ) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_returns_true_when_any_judge_matches(self): - judges = { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate.", - ), - "speed": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", - ), - } - assert _acceptance_criteria_implies_duration_optimization(judges) is True - - def test_returns_false_when_acceptance_statement_is_none(self): - judges = { - "quality": OptimizationJudge(threshold=0.8, acceptance_statement=None) - } - assert _acceptance_criteria_implies_duration_optimization(judges) is False - - # --------------------------------------------------------------------------- # _evaluate_duration # --------------------------------------------------------------------------- @@ -4069,17 +3964,9 @@ class TestDurationOptimizationChaosMode: def setup_method(self): self.mock_ldai = _make_ldai_client() - def _duration_judges(self, statement="The response must be fast."): - return { - "speed": OptimizationJudge( - threshold=0.8, - acceptance_statement=statement, - ) - } - def _ctx_with(self, duration_ms, score=1.0, iteration=1): return OptimizationContext( - scores={"speed": JudgeResult(score=score)}, + scores={"accuracy": JudgeResult(score=score)}, completion_response="answer", current_instructions="Do X.", current_parameters={}, @@ -4105,7 +3992,7 @@ async def test_duration_gate_triggers_variation_when_not_fast_enough(self): handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE)) opts = _make_options( handle_agent_call=handle_agent_call, - judges=self._duration_judges(), + latency_optimization=True, max_attempts=5, ) @@ -4131,7 +4018,7 @@ async def test_duration_check_skipped_on_first_iteration_no_baseline(self): opts = _make_options( handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")), - judges=self._duration_judges(), + latency_optimization=True, max_attempts=3, ) @@ -4142,26 +4029,20 @@ async def test_duration_check_skipped_on_first_iteration_no_baseline(self): # Succeeds because history is empty and duration check is skipped assert result.duration_ms == 9999 - async def test_no_duration_gate_when_acceptance_criteria_has_no_latency_keywords(self): - """Acceptance statement with no latency keywords → duration gate never applied.""" + async def test_no_duration_gate_when_latency_optimization_is_none(self): + """latency_optimization=None → duration gate never applied.""" client = _make_client(self.mock_ldai) # Judge passes on first try; duration would fail if gate were applied (same as baseline) - # but since acceptance criteria has no latency keywords, it should succeed anyway + # but since latency_optimization=None, the gate is not applied execute_side_effects = [ self._ctx_with(duration_ms=2000, score=1.0, iteration=1), self._ctx_with(duration_ms=2000, score=1.0, iteration=2), # validation ] - non_latency_judges = { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate and complete.", - ) - } opts = _make_options( handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")), - judges=non_latency_judges, + latency_optimization=None, max_attempts=3, ) @@ -4193,7 +4074,7 @@ async def test_evaluate_duration_called_in_validation_phase(self): handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE)) opts = _make_options( handle_agent_call=handle_agent_call, - judges=self._duration_judges(), + latency_optimization=True, max_attempts=5, ) @@ -4214,17 +4095,9 @@ class TestDurationOptimizationGroundTruthMode: def setup_method(self): self.mock_ldai = _make_ldai_client() - def _duration_judges(self): - return { - "speed": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be fast.", - ) - } - def _gt_ctx(self, duration_ms, score=1.0, iteration=1, user_input="q"): return OptimizationContext( - scores={"speed": JudgeResult(score=score)}, + scores={"acc": JudgeResult(score=score)}, completion_response="answer", current_instructions="Do X.", current_parameters={}, @@ -4268,7 +4141,7 @@ async def test_duration_gate_applied_per_sample_in_ground_truth_mode(self): handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE)) opts = _make_gt_options( handle_agent_call=handle_agent_call, - judges=self._duration_judges(), + latency_optimization=True, max_attempts=5, ) @@ -4283,8 +4156,8 @@ async def test_duration_gate_applied_per_sample_in_ground_truth_mode(self): assert handle_agent_call.call_count == 2 assert mock_execute.call_count == 6 - async def test_no_duration_gate_in_gt_mode_when_no_latency_keywords(self): - """In GT mode, duration gate is not applied when acceptance criteria has no latency keywords.""" + async def test_no_duration_gate_in_gt_mode_when_latency_optimization_not_set(self): + """In GT mode, duration gate is not applied when latency_optimization is None.""" client = _make_client(self.mock_ldai) execute_side_effects = [ @@ -4292,15 +4165,9 @@ async def test_no_duration_gate_in_gt_mode_when_no_latency_keywords(self): self._gt_ctx(duration_ms=5000, score=1.0, iteration=2, user_input="q2"), ] - non_latency_judges = { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate.", - ) - } opts = _make_gt_options( handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")), - judges=non_latency_judges, + latency_optimization=None, max_attempts=3, ) @@ -4308,7 +4175,7 @@ async def test_no_duration_gate_in_gt_mode_when_no_latency_keywords(self): mock_execute.side_effect = execute_side_effects results = await client.optimize_from_ground_truth_options("test-agent", opts) - # Succeeds on first attempt even with slow duration (no latency keyword → no gate) + # Succeeds on first attempt even with slow duration (latency_optimization=None → no gate) assert isinstance(results, list) assert mock_execute.call_count == 2 @@ -4946,6 +4813,21 @@ async def test_commit_not_called_when_auto_commit_false(self): mock_commit.assert_not_called() + async def test_commit_not_called_when_api_config_auto_commit_false(self): + """autoCommit: false in the API config suppresses the commit even when + OptimizationFromConfigOptions.auto_commit is True (the default).""" + client = self._make_client_with_key() + mock_api = _make_mock_api_client() + api_config_no_commit = {**_API_CONFIG, "autoCommit": False} + mock_api.get_agent_optimization = MagicMock(return_value=api_config_no_commit) + + with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api): + with patch.object(client, "_commit_variation") as mock_commit: + # options.auto_commit is True (default); commit must still be skipped + await client.optimize_from_config("my-opt", _make_from_config_options()) + + mock_commit.assert_not_called() + async def test_commit_receives_pre_built_api_client(self): """The api_client created for fetching config is reused for _commit_variation.""" client = self._make_client_with_key() @@ -5345,61 +5227,6 @@ def test_returns_partial_cost_when_only_output_count_is_none(self): assert estimate_cost(usage, model_config) == pytest.approx(60 * 0.001) -# --------------------------------------------------------------------------- -# _acceptance_criteria_implies_cost_optimization -# --------------------------------------------------------------------------- - - -class TestAcceptanceCriteriaImpliesCostOptimization: - def _judge(self, statement: str) -> Dict[str, OptimizationJudge]: - return {"j": OptimizationJudge(threshold=0.9, acceptance_statement=statement)} - - def test_returns_false_when_judges_none(self): - assert _acceptance_criteria_implies_cost_optimization(None) is False - - def test_returns_false_when_no_acceptance_statements(self): - judges = {"j": OptimizationJudge(threshold=0.9, judge_key="some-judge")} - assert _acceptance_criteria_implies_cost_optimization(judges) is False - - def test_detects_cheap(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("Keep it cheap.")) - - def test_detects_cost(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("Reduce overall cost.")) - - def test_detects_costs_plural(self): - assert _acceptance_criteria_implies_cost_optimization( - self._judge("Keep the costs stable or lower them.") - ) - - def test_detects_budget(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("Stay within budget.")) - - def test_does_not_detect_token_to_avoid_false_positives(self): - assert not _acceptance_criteria_implies_cost_optimization(self._judge("Generate a valid authentication token.")) - - def test_detects_billing(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("Minimize billing.")) - - def test_detects_spend(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("Reduce spend on API calls.")) - - def test_case_insensitive(self): - assert _acceptance_criteria_implies_cost_optimization(self._judge("BUDGET FRIENDLY response")) - - def test_no_match_on_unrelated_statement(self): - assert not _acceptance_criteria_implies_cost_optimization( - self._judge("Respond accurately and concisely.") - ) - - def test_multiple_judges_one_matches(self): - judges = { - "j1": OptimizationJudge(threshold=0.9, acceptance_statement="Be accurate."), - "j2": OptimizationJudge(threshold=0.9, acceptance_statement="Keep costs low."), - } - assert _acceptance_criteria_implies_cost_optimization(judges) - - # --------------------------------------------------------------------------- # _evaluate_cost # --------------------------------------------------------------------------- @@ -5516,22 +5343,6 @@ def test_noop_when_all_values_none(self): class TestApplyDurationGate: """Unit tests for the _apply_duration_gate wrapper method.""" - def _make_judges_with_latency(self): - return { - "latency": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be faster and reduce latency.", - ) - } - - def _make_judges_no_latency(self): - return { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate.", - ) - } - def _ctx(self, duration_ms=None, iteration=2): return OptimizationContext( scores={}, @@ -5545,12 +5356,12 @@ def _ctx(self, duration_ms=None, iteration=2): def setup_method(self): self.client = _make_client() - self.client._options = _make_options(judges=self._make_judges_with_latency()) + self.client._options = _make_options(latency_optimization=True) self.client._initialize_class_members_from_config(_make_agent_config()) self.client._baseline_duration_ms = 2000.0 def test_no_entry_added_when_gate_not_active(self): - self.client._options = _make_options(judges=self._make_judges_no_latency()) + self.client._options = _make_options(latency_optimization=None) ctx = self._ctx(1000) passed, updated = self.client._apply_duration_gate(True, ctx) assert passed is True @@ -5621,22 +5432,6 @@ def test_no_threshold_field_on_judge_result(self): class TestApplyCostGate: """Unit tests for the _apply_cost_gate wrapper method.""" - def _make_judges_with_cost(self): - return { - "cost": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be cheaper and reduce cost.", - ) - } - - def _make_judges_no_cost(self): - return { - "accuracy": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be accurate.", - ) - } - def _ctx(self, cost=None, iteration=2): return OptimizationContext( scores={}, @@ -5650,12 +5445,12 @@ def _ctx(self, cost=None, iteration=2): def setup_method(self): self.client = _make_client() - self.client._options = _make_options(judges=self._make_judges_with_cost()) + self.client._options = _make_options(token_optimization=True) self.client._initialize_class_members_from_config(_make_agent_config()) self.client._baseline_cost_usd = 0.010 def test_no_entry_added_when_gate_not_active(self): - self.client._options = _make_options(judges=self._make_judges_no_cost()) + self.client._options = _make_options(token_optimization=None) ctx = self._ctx(0.005) passed, updated = self.client._apply_cost_gate(True, ctx) assert passed is True @@ -5711,12 +5506,8 @@ def test_existing_scores_are_preserved(self): def test_both_gates_active_compose_cleanly(self): """Duration + cost gate can both fire on the same context.""" self.client._options = _make_options( - judges={ - "perf": OptimizationJudge( - threshold=0.8, - acceptance_statement="The response must be faster, reduce latency, and cheaper cost.", - ) - } + latency_optimization=True, + token_optimization=True, ) self.client._baseline_duration_ms = 2000.0 self.client._baseline_cost_usd = 0.010 @@ -6064,7 +5855,7 @@ def _set_pricing(self): {"id": "gpt-4o", "costPerInputToken": 0.000005, "costPerOutputToken": 0.000015} ] - async def test_cost_context_injected_into_instructions(self): + async def test_cost_context_injected_when_token_optimization_true(self): self._set_pricing() usage = TokenUsage(total=100, input=60, output=40) captured: list = [] @@ -6073,7 +5864,9 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): captured.append(judge_config.instructions) return OptimizationResponse(output=JUDGE_PASS_RESPONSE) - self.client._options = _make_options(handle_judge_call=_capture_judge_call) + self.client._options = _make_options( + handle_judge_call=_capture_judge_call, token_optimization=True + ) await self.client._evaluate_acceptance_judge( judge_key="cost-judge", optimization_judge=self._cost_judge(), @@ -6088,7 +5881,8 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): assert "60 input tokens" in instructions assert "40 output tokens" in instructions - async def test_cost_context_not_injected_for_non_cost_judge(self): + async def test_cost_context_not_injected_when_token_optimization_false(self): + self._set_pricing() usage = TokenUsage(total=100, input=60, output=40) captured: list = [] @@ -6096,14 +5890,12 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): captured.append(judge_config.instructions) return OptimizationResponse(output=JUDGE_PASS_RESPONSE) - self.client._options = _make_options(handle_judge_call=_capture_judge_call) - non_cost_judge = OptimizationJudge( - threshold=0.9, - acceptance_statement="Be accurate and concise.", + self.client._options = _make_options( + handle_judge_call=_capture_judge_call, token_optimization=False ) await self.client._evaluate_acceptance_judge( - judge_key="quality-judge", - optimization_judge=non_cost_judge, + judge_key="cost-judge", + optimization_judge=self._cost_judge(), completion_response="response", iteration=1, reasoning_history="", @@ -6112,7 +5904,6 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): ) assert captured instructions = captured[0] - # The cost-specific augmentation phrase should not appear assert "cost/token-usage goal" not in instructions async def test_baseline_cost_shown_when_history_present(self): @@ -6135,7 +5926,9 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): ) self.client._history = [baseline_ctx] self.client._baseline_cost_usd = 500.0 - self.client._options = _make_options(handle_judge_call=_capture_judge_call) + self.client._options = _make_options( + handle_judge_call=_capture_judge_call, token_optimization=True + ) await self.client._evaluate_acceptance_judge( judge_key="cost-judge", optimization_judge=self._cost_judge(), @@ -6148,3 +5941,205 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge): assert captured instructions = captured[0] assert "baseline" in instructions.lower() + + +# --------------------------------------------------------------------------- +# variation_key in optimize_from_options +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +class TestVariationKeyInOptimizeFromOptions: + def _make_client_with_key(self) -> OptimizationClient: + with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}): + return OptimizationClient(_make_ldai_client()) + + def _make_client_without_key(self) -> OptimizationClient: + client = OptimizationClient(_make_ldai_client()) + client._has_api_key = False + client._api_key = None + return client + + def _make_ai_config_with_variations(self, *keys: str) -> dict: + return { + "variations": [ + {"key": k, "instructions": f"Instructions for {k}.", "mode": "agent"} + for k in keys + ] + } + + async def test_raises_when_variation_key_set_and_no_api_key(self): + client = self._make_client_without_key() + options = _make_options(variation_key="my-variation", project_key="my-project") + + with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"): + await client.optimize_from_options("test-agent", options) + + async def test_raises_when_variation_key_set_and_no_project_key(self): + client = self._make_client_with_key() + options = _make_options(variation_key="my-variation", project_key=None) + + with pytest.raises(ValueError, match="project_key"): + await client.optimize_from_options("test-agent", options) + + async def test_uses_variation_key_as_base_variation(self): + client = self._make_client_with_key() + ai_config = self._make_ai_config_with_variations("v1", "my-variation", "v3") + + with patch("ldai_optimizer.client.LDApiClient") as mock_api_cls: + mock_api_instance = MagicMock() + mock_api_instance.get_ai_config.return_value = ai_config + mock_api_instance.get_model_configs.return_value = [] + mock_api_cls.return_value = mock_api_instance + + options = _make_options( + variation_key="my-variation", + project_key="my-project", + ) + await client.optimize_from_options("test-agent", options) + + mock_api_instance.get_ai_config.assert_called_with("my-project", "test-agent") + # Verify that the SDK default variation() was NOT called + client._ldClient._client.variation.assert_not_called() + + async def test_raises_when_variation_key_not_found_in_config(self): + client = self._make_client_with_key() + ai_config = self._make_ai_config_with_variations("v1", "v2") + + with patch("ldai_optimizer.client.LDApiClient") as mock_api_cls: + mock_api_instance = MagicMock() + mock_api_instance.get_ai_config.return_value = ai_config + mock_api_instance.get_model_configs.return_value = [] + mock_api_cls.return_value = mock_api_instance + + options = _make_options( + variation_key="nonexistent-key", + project_key="my-project", + ) + with pytest.raises(ValueError, match="nonexistent-key"): + await client.optimize_from_options("test-agent", options) + + async def test_no_api_call_when_variation_key_not_set(self): + client = self._make_client_without_key() + options = _make_options() # no variation_key + + # Should succeed and use the SDK default variation path + result = await client.optimize_from_options("test-agent", options) + client._ldClient._client.variation.assert_called() + assert result is not None + + +# --------------------------------------------------------------------------- +# variation_key in optimize_from_ground_truth_options +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +class TestVariationKeyInOptimizeFromGroundTruthOptions: + def _make_client_with_key(self) -> OptimizationClient: + with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}): + return OptimizationClient(_make_ldai_client()) + + def _make_client_without_key(self) -> OptimizationClient: + client = OptimizationClient(_make_ldai_client()) + client._has_api_key = False + client._api_key = None + return client + + async def test_raises_when_variation_key_set_and_no_api_key(self): + client = self._make_client_without_key() + options = _make_gt_options(variation_key="my-variation", project_key="my-project") + + with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"): + await client.optimize_from_ground_truth_options("test-agent", options) + + async def test_raises_when_variation_key_set_and_no_project_key(self): + client = self._make_client_with_key() + options = _make_gt_options(variation_key="my-variation", project_key=None) + + with pytest.raises(ValueError, match="project_key"): + await client.optimize_from_ground_truth_options("test-agent", options) + + async def test_raises_when_variation_key_not_found_in_config(self): + client = self._make_client_with_key() + ai_config = {"variations": [{"key": "v1"}, {"key": "v2"}]} + + with patch("ldai_optimizer.client.LDApiClient") as mock_api_cls: + mock_api_instance = MagicMock() + mock_api_instance.get_ai_config.return_value = ai_config + mock_api_instance.get_model_configs.return_value = [] + mock_api_cls.return_value = mock_api_instance + + options = _make_gt_options( + variation_key="nonexistent-key", + project_key="my-project", + ) + with pytest.raises(ValueError, match="nonexistent-key"): + await client.optimize_from_ground_truth_options("test-agent", options) + + +# --------------------------------------------------------------------------- +# latency_optimization / token_optimization boolean controls +# --------------------------------------------------------------------------- + + +class TestLatencyCostOptimizationBooleans: + """Verify that latency_optimization and token_optimization booleans directly + control gate and prompt behaviour, replacing the old regex approach.""" + + def setup_method(self): + self.client = _make_client() + self.client._initialize_class_members_from_config(_make_agent_config()) + self.client._baseline_duration_ms = 1000.0 + self.client._baseline_cost_usd = 0.010 + + def _ctx(self, duration_ms=500.0, cost=0.005, iteration=2): + return OptimizationContext( + scores={}, + completion_response="response", + current_instructions="Do X.", + current_parameters={}, + current_variables={}, + iteration=iteration, + duration_ms=duration_ms, + estimated_cost_usd=cost, + ) + + def test_latency_gate_active_when_true(self): + self.client._options = _make_options(latency_optimization=True) + _, updated = self.client._apply_duration_gate(True, self._ctx(duration_ms=500.0)) + assert "_latency_gate" in updated.scores + + def test_latency_gate_inactive_when_none(self): + self.client._options = _make_options(latency_optimization=None) + _, updated = self.client._apply_duration_gate(True, self._ctx(duration_ms=500.0)) + assert "_latency_gate" not in updated.scores + + def test_latency_gate_inactive_when_false(self): + self.client._options = _make_options(latency_optimization=False) + _, updated = self.client._apply_duration_gate(True, self._ctx(duration_ms=500.0)) + assert "_latency_gate" not in updated.scores + + def test_cost_gate_active_when_true(self): + self.client._options = _make_options(token_optimization=True) + _, updated = self.client._apply_cost_gate(True, self._ctx(cost=0.005)) + assert "_cost_gate" in updated.scores + + def test_cost_gate_inactive_when_none(self): + self.client._options = _make_options(token_optimization=None) + _, updated = self.client._apply_cost_gate(True, self._ctx(cost=0.005)) + assert "_cost_gate" not in updated.scores + + def test_cost_gate_inactive_when_false(self): + self.client._options = _make_options(token_optimization=False) + _, updated = self.client._apply_cost_gate(True, self._ctx(cost=0.005)) + assert "_cost_gate" not in updated.scores + + def test_both_gates_independent(self): + """latency_optimization=True, token_optimization=False → only latency gate fires.""" + self.client._options = _make_options(latency_optimization=True, token_optimization=False) + ctx = self._ctx() + _, ctx = self.client._apply_duration_gate(True, ctx) + _, ctx = self.client._apply_cost_gate(True, ctx) + assert "_latency_gate" in ctx.scores + assert "_cost_gate" not in ctx.scores