From 806b564e7f4de8d3914e187c6072709b24f1526b Mon Sep 17 00:00:00 2001 From: Andrew Klatzke Date: Mon, 1 Jun 2026 15:15:15 -0800 Subject: [PATCH] circumvents judge_config calls to make judge evaluations in optimization --- .../optimization/src/ldai_optimizer/client.py | 69 ++++++++++- packages/optimization/tests/test_client.py | 111 +++++++++++------- 2 files changed, 132 insertions(+), 48 deletions(-) diff --git a/packages/optimization/src/ldai_optimizer/client.py b/packages/optimization/src/ldai_optimizer/client.py index bf82139a..dc10afcb 100644 --- a/packages/optimization/src/ldai_optimizer/client.py +++ b/packages/optimization/src/ldai_optimizer/client.py @@ -20,6 +20,7 @@ import logging import os import random +import re import time import uuid from typing import Any, Dict, List, Literal, Optional, Tuple, Union @@ -69,6 +70,15 @@ logger.addFilter(RedactionFilter()) +def _interpolate(template: str, variables: Dict[str, Any]) -> str: + """Replace {{key}} tokens with values from variables; unresolved tokens become empty string.""" + return re.sub( + r"\{\{(\w+)\}\}", + lambda m: str(variables.get(m.group(1), "")), + template, + ) + + def _find_model_config( model_name: str, configs: List[Dict[str, Any]] ) -> Optional[Dict[str, Any]]: @@ -402,18 +412,65 @@ def _judge_config( variables: Dict[str, Any], ) -> AIJudgeConfig: """ - Fetch a judge configuration from the LaunchDarkly client. + Fetch a judge configuration by evaluating the flag variation directly. - Thin wrapper around LDAIClient.judge_config so callers do not need a - direct reference to the client. + Bypasses LDAIClient.judge_config to avoid the reserved-variable warnings + for 'message_history' and 'response_to_evaluate'. Those variables are + interpolated here with their actual values instead of being neutralised + by the SDK. If the template contains only a system message, a user turn + is synthesised from the provided message_history and response_to_evaluate + so that _evaluate_config_judge always receives a complete conversation. :param judge_key: The key for the judge configuration in LaunchDarkly :param context: The evaluation context - :param default: Fallback config when the flag is disabled or unreachable - :param variables: Template variables for instruction interpolation + :param default: Unused; kept for signature compatibility + :param variables: Template variables including message_history and response_to_evaluate :return: The resolved AIJudgeConfig """ - return self._ldClient.judge_config(judge_key, context, default, variables) + variation: Dict[str, Any] = self._ldClient._client.variation(judge_key, context, {}) + enabled: bool = bool(variation.get("_ldMeta", {}).get("enabled", False)) + + all_variables: Dict[str, Any] = {"ldctx": context.to_dict(), **variables} + + messages: List[LDMessage] = [] + raw_messages = variation.get("messages") + if isinstance(raw_messages, list) and all(isinstance(m, dict) for m in raw_messages): + messages = [ + LDMessage( + role=m["role"], + content=_interpolate(m.get("content", ""), all_variables), + ) + for m in raw_messages + ] + + # New-style templates only have a system message. Auto-generate a user + # turn so _evaluate_config_judge always has a complete conversation to split. + if not any(m.role == "user" for m in messages): + message_history = variables.get("message_history", "") + response_to_evaluate = variables.get("response_to_evaluate", "") + parts: List[str] = [] + if message_history: + parts.append(str(message_history)) + parts.append(f"Here is the response to evaluate: {response_to_evaluate}") + messages.append(LDMessage(role="user", content="\n\n".join(parts))) + + model: Optional[ModelConfig] = None + raw_model = variation.get("model") + if isinstance(raw_model, dict): + model = ModelConfig( + name=raw_model.get("name", ""), + parameters=raw_model.get("parameters"), + custom=raw_model.get("custom"), + ) + + return AIJudgeConfig( + key=judge_key, + enabled=enabled, + create_tracker=lambda: None, + model=model, + messages=messages, + evaluation_metric_key=variation.get("evaluationMetricKey"), + ) def _serialize_scores( self, judge_results: Dict[str, JudgeResult] diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py index 344a0584..099d2a58 100644 --- a/packages/optimization/tests/test_client.py +++ b/packages/optimization/tests/test_client.py @@ -6,7 +6,7 @@ from unittest.mock import AsyncMock, MagicMock, patch import pytest -from ldai import AIAgentConfig, AIJudgeConfig, LDAIClient +from ldai import AIAgentConfig, LDAIClient from ldai.client import Evaluator from ldai.models import LDMessage, ModelConfig from ldai.tracker import TokenUsage @@ -717,20 +717,19 @@ def setup_method(self): self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)) self.client._options = _make_options(handle_judge_call=self.handle_judge_call) - def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig: - return AIJudgeConfig( - key="ld-judge-key", - enabled=enabled, - create_tracker=MagicMock, - model=ModelConfig(name="gpt-4o", parameters={}), - messages=[ - LDMessage(role="system", content="You are an evaluator."), - LDMessage(role="user", content="Evaluate this response."), + def _make_raw_variation(self, enabled: bool = True) -> Dict[str, Any]: + """Raw variation dict as returned by _client.variation for a judge flag.""" + return { + "_ldMeta": {"enabled": enabled}, + "messages": [ + {"role": "system", "content": "You are an evaluator."}, + {"role": "user", "content": "Evaluate this response."}, ], - ) + "model": {"name": "gpt-4o", "parameters": {}}, + } async def test_calls_handle_judge_call_with_correct_config_type(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -748,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self): assert isinstance(ctx, OptimizationJudgeContext) async def test_messages_has_system_and_user_turns(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -763,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self): assert roles == ["system", "user"] async def test_messages_system_content_matches_instructions(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -778,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self): assert system_msg.content == config.instructions async def test_messages_user_content_matches_context_user_input(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -793,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self): assert user_msg.content == ctx.user_input async def test_messages_user_content_contains_ld_user_message(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge( judge_key="quality", @@ -808,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self): assert "Evaluate this response." in user_msg.content async def test_returns_zero_score_when_judge_disabled(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False) + self.mock_ldai._client.variation.return_value = self._make_raw_variation(enabled=False) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") result = await self.client._evaluate_config_judge( judge_key="quality", @@ -821,31 +820,37 @@ async def test_returns_zero_score_when_judge_disabled(self): assert result.score == 0.0 self.handle_judge_call.assert_not_called() - async def test_returns_zero_score_when_judge_has_no_messages(self): - judge_config = AIJudgeConfig( - key="ld-judge-key", - enabled=True, - create_tracker=MagicMock, - model=ModelConfig(name="gpt-4o", parameters={}), - messages=None, - ) - self.mock_ldai.judge_config.return_value = judge_config + async def test_system_only_template_auto_generates_user_message(self): + """When the flag template has only a system message, a user turn is synthesised.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [{"role": "system", "content": "You are an evaluator."}], + "model": {"name": "gpt-4o", "parameters": {}}, + } judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") - result = await self.client._evaluate_config_judge( + await self.client._evaluate_config_judge( judge_key="quality", optimization_judge=judge, - completion_response="Any.", + completion_response="The answer is 42.", iteration=1, reasoning_history="", - user_input="Anything?", + user_input="What is the answer?", ) - assert result.score == 0.0 - self.handle_judge_call.assert_not_called() - - async def test_template_variables_merged_into_judge_config_call(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + _, config, _, _ = self.handle_judge_call.call_args.args + user_msg = next(m for m in config.messages if m.role == "user") + assert "The answer is 42." in user_msg.content + + async def test_template_variables_interpolated_into_messages(self): + """Custom agent variables are interpolated into judge template messages.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [ + {"role": "system", "content": "Evaluate in {{language}}."}, + {"role": "user", "content": "Evaluate this response."}, + ], + "model": {"name": "gpt-4o", "parameters": {}}, + } judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") - variables = {"language": "Spanish"} await self.client._evaluate_config_judge( judge_key="quality", optimization_judge=judge, @@ -853,16 +858,38 @@ async def test_template_variables_merged_into_judge_config_call(self): iteration=1, reasoning_history="", user_input="Q?", - variables=variables, + variables={"language": "Spanish"}, ) - call_kwargs = self.mock_ldai.judge_config.call_args - passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {}) - assert passed_vars.get("language") == "Spanish" - assert "message_history" in passed_vars - assert "response_to_evaluate" in passed_vars + _, config, _, _ = self.handle_judge_call.call_args.args + assert "Spanish" in config.instructions + + async def test_reserved_variables_interpolated_into_template_messages(self): + """message_history and response_to_evaluate are interpolated when present in the template.""" + self.mock_ldai._client.variation.return_value = { + "_ldMeta": {"enabled": True}, + "messages": [ + {"role": "system", "content": "History: {{message_history}}"}, + {"role": "user", "content": "Response: {{response_to_evaluate}}"}, + ], + "model": {"name": "gpt-4o", "parameters": {}}, + } + judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") + await self.client._evaluate_config_judge( + judge_key="quality", + optimization_judge=judge, + completion_response="My answer.", + iteration=1, + reasoning_history="", + user_input="Q?", + ) + _, config, _, _ = self.handle_judge_call.call_args.args + system_msg = next(m for m in config.messages if m.role == "system") + assert "History:" in system_msg.content + user_msg = next(m for m in config.messages if m.role == "user") + assert "My answer." in user_msg.content async def test_agent_tools_included_without_evaluation_tool(self): - self.mock_ldai.judge_config.return_value = self._make_judge_config() + self.mock_ldai._client.variation.return_value = self._make_raw_variation() agent_tool = ToolDefinition(name="search", description="Search", input_schema={}) judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key") await self.client._evaluate_config_judge(