From 806b564e7f4de8d3914e187c6072709b24f1526b Mon Sep 17 00:00:00 2001
From: Andrew Klatzke <aklatzke@launchdarkly.com>
Date: Mon, 1 Jun 2026 15:15:15 -0800
Subject: [PATCH] circumvents judge_config calls to make judge evaluations in
 optimization

---
 .../optimization/src/ldai_optimizer/client.py |  69 ++++++++++-
 packages/optimization/tests/test_client.py    | 111 +++++++++++-------
 2 files changed, 132 insertions(+), 48 deletions(-)

diff --git a/packages/optimization/src/ldai_optimizer/client.py b/packages/optimization/src/ldai_optimizer/client.py
index bf82139a..dc10afcb 100644
--- a/packages/optimization/src/ldai_optimizer/client.py
+++ b/packages/optimization/src/ldai_optimizer/client.py
@@ -20,6 +20,7 @@
 import logging
 import os
 import random
+import re
 import time
 import uuid
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union
@@ -69,6 +70,15 @@
 logger.addFilter(RedactionFilter())
 
 
+def _interpolate(template: str, variables: Dict[str, Any]) -> str:
+    """Replace {{key}} tokens with values from variables; unresolved tokens become empty string."""
+    return re.sub(
+        r"\{\{(\w+)\}\}",
+        lambda m: str(variables.get(m.group(1), "")),
+        template,
+    )
+
+
 def _find_model_config(
     model_name: str, configs: List[Dict[str, Any]]
 ) -> Optional[Dict[str, Any]]:
@@ -402,18 +412,65 @@ def _judge_config(
         variables: Dict[str, Any],
     ) -> AIJudgeConfig:
         """
-        Fetch a judge configuration from the LaunchDarkly client.
+        Fetch a judge configuration by evaluating the flag variation directly.
 
-        Thin wrapper around LDAIClient.judge_config so callers do not need a
-        direct reference to the client.
+        Bypasses LDAIClient.judge_config to avoid the reserved-variable warnings
+        for 'message_history' and 'response_to_evaluate'. Those variables are
+        interpolated here with their actual values instead of being neutralised
+        by the SDK. If the template contains only a system message, a user turn
+        is synthesised from the provided message_history and response_to_evaluate
+        so that _evaluate_config_judge always receives a complete conversation.
 
         :param judge_key: The key for the judge configuration in LaunchDarkly
         :param context: The evaluation context
-        :param default: Fallback config when the flag is disabled or unreachable
-        :param variables: Template variables for instruction interpolation
+        :param default: Unused; kept for signature compatibility
+        :param variables: Template variables including message_history and response_to_evaluate
         :return: The resolved AIJudgeConfig
         """
-        return self._ldClient.judge_config(judge_key, context, default, variables)
+        variation: Dict[str, Any] = self._ldClient._client.variation(judge_key, context, {})
+        enabled: bool = bool(variation.get("_ldMeta", {}).get("enabled", False))
+
+        all_variables: Dict[str, Any] = {"ldctx": context.to_dict(), **variables}
+
+        messages: List[LDMessage] = []
+        raw_messages = variation.get("messages")
+        if isinstance(raw_messages, list) and all(isinstance(m, dict) for m in raw_messages):
+            messages = [
+                LDMessage(
+                    role=m["role"],
+                    content=_interpolate(m.get("content", ""), all_variables),
+                )
+                for m in raw_messages
+            ]
+
+        # New-style templates only have a system message. Auto-generate a user
+        # turn so _evaluate_config_judge always has a complete conversation to split.
+        if not any(m.role == "user" for m in messages):
+            message_history = variables.get("message_history", "")
+            response_to_evaluate = variables.get("response_to_evaluate", "")
+            parts: List[str] = []
+            if message_history:
+                parts.append(str(message_history))
+            parts.append(f"Here is the response to evaluate: {response_to_evaluate}")
+            messages.append(LDMessage(role="user", content="\n\n".join(parts)))
+
+        model: Optional[ModelConfig] = None
+        raw_model = variation.get("model")
+        if isinstance(raw_model, dict):
+            model = ModelConfig(
+                name=raw_model.get("name", ""),
+                parameters=raw_model.get("parameters"),
+                custom=raw_model.get("custom"),
+            )
+
+        return AIJudgeConfig(
+            key=judge_key,
+            enabled=enabled,
+            create_tracker=lambda: None,
+            model=model,
+            messages=messages,
+            evaluation_metric_key=variation.get("evaluationMetricKey"),
+        )
 
     def _serialize_scores(
         self, judge_results: Dict[str, JudgeResult]
diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py
index 344a0584..099d2a58 100644
--- a/packages/optimization/tests/test_client.py
+++ b/packages/optimization/tests/test_client.py
@@ -6,7 +6,7 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from ldai import AIAgentConfig, AIJudgeConfig, LDAIClient
+from ldai import AIAgentConfig, LDAIClient
 from ldai.client import Evaluator
 from ldai.models import LDMessage, ModelConfig
 from ldai.tracker import TokenUsage
@@ -717,20 +717,19 @@ def setup_method(self):
         self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
         self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
 
-    def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig:
-        return AIJudgeConfig(
-            key="ld-judge-key",
-            enabled=enabled,
-            create_tracker=MagicMock,
-            model=ModelConfig(name="gpt-4o", parameters={}),
-            messages=[
-                LDMessage(role="system", content="You are an evaluator."),
-                LDMessage(role="user", content="Evaluate this response."),
+    def _make_raw_variation(self, enabled: bool = True) -> Dict[str, Any]:
+        """Raw variation dict as returned by _client.variation for a judge flag."""
+        return {
+            "_ldMeta": {"enabled": enabled},
+            "messages": [
+                {"role": "system", "content": "You are an evaluator."},
+                {"role": "user", "content": "Evaluate this response."},
             ],
-        )
+            "model": {"name": "gpt-4o", "parameters": {}},
+        }
 
     async def test_calls_handle_judge_call_with_correct_config_type(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -748,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self):
         assert isinstance(ctx, OptimizationJudgeContext)
 
     async def test_messages_has_system_and_user_turns(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -763,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self):
         assert roles == ["system", "user"]
 
     async def test_messages_system_content_matches_instructions(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -778,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self):
         assert system_msg.content == config.instructions
 
     async def test_messages_user_content_matches_context_user_input(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -793,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self):
         assert user_msg.content == ctx.user_input
 
     async def test_messages_user_content_contains_ld_user_message(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -808,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self):
         assert "Evaluate this response." in user_msg.content
 
     async def test_returns_zero_score_when_judge_disabled(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False)
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation(enabled=False)
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         result = await self.client._evaluate_config_judge(
             judge_key="quality",
@@ -821,31 +820,37 @@ async def test_returns_zero_score_when_judge_disabled(self):
         assert result.score == 0.0
         self.handle_judge_call.assert_not_called()
 
-    async def test_returns_zero_score_when_judge_has_no_messages(self):
-        judge_config = AIJudgeConfig(
-            key="ld-judge-key",
-            enabled=True,
-            create_tracker=MagicMock,
-            model=ModelConfig(name="gpt-4o", parameters={}),
-            messages=None,
-        )
-        self.mock_ldai.judge_config.return_value = judge_config
+    async def test_system_only_template_auto_generates_user_message(self):
+        """When the flag template has only a system message, a user turn is synthesised."""
+        self.mock_ldai._client.variation.return_value = {
+            "_ldMeta": {"enabled": True},
+            "messages": [{"role": "system", "content": "You are an evaluator."}],
+            "model": {"name": "gpt-4o", "parameters": {}},
+        }
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
-        result = await self.client._evaluate_config_judge(
+        await self.client._evaluate_config_judge(
             judge_key="quality",
             optimization_judge=judge,
-            completion_response="Any.",
+            completion_response="The answer is 42.",
             iteration=1,
             reasoning_history="",
-            user_input="Anything?",
+            user_input="What is the answer?",
         )
-        assert result.score == 0.0
-        self.handle_judge_call.assert_not_called()
-
-    async def test_template_variables_merged_into_judge_config_call(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        _, config, _, _ = self.handle_judge_call.call_args.args
+        user_msg = next(m for m in config.messages if m.role == "user")
+        assert "The answer is 42." in user_msg.content
+
+    async def test_template_variables_interpolated_into_messages(self):
+        """Custom agent variables are interpolated into judge template messages."""
+        self.mock_ldai._client.variation.return_value = {
+            "_ldMeta": {"enabled": True},
+            "messages": [
+                {"role": "system", "content": "Evaluate in {{language}}."},
+                {"role": "user", "content": "Evaluate this response."},
+            ],
+            "model": {"name": "gpt-4o", "parameters": {}},
+        }
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
-        variables = {"language": "Spanish"}
         await self.client._evaluate_config_judge(
             judge_key="quality",
             optimization_judge=judge,
@@ -853,16 +858,38 @@ async def test_template_variables_merged_into_judge_config_call(self):
             iteration=1,
             reasoning_history="",
             user_input="Q?",
-            variables=variables,
+            variables={"language": "Spanish"},
         )
-        call_kwargs = self.mock_ldai.judge_config.call_args
-        passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {})
-        assert passed_vars.get("language") == "Spanish"
-        assert "message_history" in passed_vars
-        assert "response_to_evaluate" in passed_vars
+        _, config, _, _ = self.handle_judge_call.call_args.args
+        assert "Spanish" in config.instructions
+
+    async def test_reserved_variables_interpolated_into_template_messages(self):
+        """message_history and response_to_evaluate are interpolated when present in the template."""
+        self.mock_ldai._client.variation.return_value = {
+            "_ldMeta": {"enabled": True},
+            "messages": [
+                {"role": "system", "content": "History: {{message_history}}"},
+                {"role": "user", "content": "Response: {{response_to_evaluate}}"},
+            ],
+            "model": {"name": "gpt-4o", "parameters": {}},
+        }
+        judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+        await self.client._evaluate_config_judge(
+            judge_key="quality",
+            optimization_judge=judge,
+            completion_response="My answer.",
+            iteration=1,
+            reasoning_history="",
+            user_input="Q?",
+        )
+        _, config, _, _ = self.handle_judge_call.call_args.args
+        system_msg = next(m for m in config.messages if m.role == "system")
+        assert "History:" in system_msg.content
+        user_msg = next(m for m in config.messages if m.role == "user")
+        assert "My answer." in user_msg.content
 
     async def test_agent_tools_included_without_evaluation_tool(self):
-        self.mock_ldai.judge_config.return_value = self._make_judge_config()
+        self.mock_ldai._client.variation.return_value = self._make_raw_variation()
         agent_tool = ToolDefinition(name="search", description="Search", input_schema={})
         judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
         await self.client._evaluate_config_judge(