Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 63 additions & 6 deletions packages/optimization/src/ldai_optimizer/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import logging
import os
import random
import re
import time
import uuid
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
Expand Down Expand Up @@ -69,6 +70,15 @@
logger.addFilter(RedactionFilter())


# Matches a Mustache-style variable tag: ``{{name}}``, ``{{ name }}`` or a
# dotted path such as ``{{ldctx.key}}``. Compiled once at import time because
# the function runs for every message of every judge evaluation.
_TEMPLATE_TOKEN_RE = re.compile(r"\{\{\s*(\w+(?:\.\w+)*)\s*\}\}")


def _interpolate(template: str, variables: Dict[str, Any]) -> str:
    """Replace ``{{key}}`` tokens in *template* with values from *variables*.

    Supported token forms:

    * ``{{key}}`` - flat lookup in *variables*.
    * ``{{ key }}`` - whitespace inside the braces is ignored.
    * ``{{outer.inner}}`` - dotted path walked through nested dicts. A flat
      key that literally contains the dots takes precedence over the walk.

    Any token that cannot be resolved (missing key, or a path that steps into
    a non-dict value) becomes the empty string. Resolved values are converted
    with ``str()``.

    :param template: Text that may contain ``{{...}}`` tokens
    :param variables: Mapping of variable names to values; may contain nested dicts
    :return: The template with every recognised token substituted
    """

    def _resolve(match: "re.Match[str]") -> str:
        token = match.group(1)
        # Exact-key hit first so flat lookups behave exactly as before.
        if token in variables:
            return str(variables[token])
        current: Any = variables
        for part in token.split("."):
            if not isinstance(current, dict) or part not in current:
                return ""
            current = current[part]
        return str(current)

    return _TEMPLATE_TOKEN_RE.sub(_resolve, template)
Comment thread
andrewklatzke marked this conversation as resolved.


def _find_model_config(
model_name: str, configs: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
Expand Down Expand Up @@ -402,18 +412,65 @@ def _judge_config(
variables: Dict[str, Any],
) -> AIJudgeConfig:
"""
Fetch a judge configuration from the LaunchDarkly client.
Fetch a judge configuration by evaluating the flag variation directly.

Thin wrapper around LDAIClient.judge_config so callers do not need a
direct reference to the client.
Bypasses LDAIClient.judge_config to avoid the reserved-variable warnings
for 'message_history' and 'response_to_evaluate'. Those variables are
interpolated here with their actual values instead of being neutralised
by the SDK. If the template contains only a system message, a user turn
is synthesised from the provided message_history and response_to_evaluate
so that _evaluate_config_judge always receives a complete conversation.

:param judge_key: The key for the judge configuration in LaunchDarkly
:param context: The evaluation context
:param default: Fallback config when the flag is disabled or unreachable
:param variables: Template variables for instruction interpolation
:param default: Unused; kept for signature compatibility
:param variables: Template variables including message_history and response_to_evaluate
:return: The resolved AIJudgeConfig
"""
return self._ldClient.judge_config(judge_key, context, default, variables)
variation: Dict[str, Any] = self._ldClient._client.variation(judge_key, context, {})
enabled: bool = bool(variation.get("_ldMeta", {}).get("enabled", False))

all_variables: Dict[str, Any] = {"ldctx": context.to_dict(), **variables}

messages: List[LDMessage] = []
raw_messages = variation.get("messages")
if isinstance(raw_messages, list) and all(isinstance(m, dict) for m in raw_messages):
messages = [
LDMessage(
role=m["role"],
content=_interpolate(m.get("content", ""), all_variables),
)
for m in raw_messages
]

# New-style templates only have a system message. Auto-generate a user
# turn so _evaluate_config_judge always has a complete conversation to split.
if not any(m.role == "user" for m in messages):
message_history = variables.get("message_history", "")
response_to_evaluate = variables.get("response_to_evaluate", "")
parts: List[str] = []
if message_history:
parts.append(str(message_history))
parts.append(f"Here is the response to evaluate: {response_to_evaluate}")
messages.append(LDMessage(role="user", content="\n\n".join(parts)))

model: Optional[ModelConfig] = None
raw_model = variation.get("model")
if isinstance(raw_model, dict):
model = ModelConfig(
name=raw_model.get("name", ""),
parameters=raw_model.get("parameters"),
custom=raw_model.get("custom"),
)

return AIJudgeConfig(
key=judge_key,
enabled=enabled,
create_tracker=lambda: None,
model=model,
messages=messages,
evaluation_metric_key=variation.get("evaluationMetricKey"),
Comment thread
andrewklatzke marked this conversation as resolved.
)

def _serialize_scores(
self, judge_results: Dict[str, JudgeResult]
Expand Down
111 changes: 69 additions & 42 deletions packages/optimization/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from ldai import AIAgentConfig, AIJudgeConfig, LDAIClient
from ldai import AIAgentConfig, LDAIClient
from ldai.client import Evaluator
from ldai.models import LDMessage, ModelConfig
from ldai.tracker import TokenUsage
Expand Down Expand Up @@ -717,20 +717,19 @@ def setup_method(self):
self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
self.client._options = _make_options(handle_judge_call=self.handle_judge_call)

def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig:
return AIJudgeConfig(
key="ld-judge-key",
enabled=enabled,
create_tracker=MagicMock,
model=ModelConfig(name="gpt-4o", parameters={}),
messages=[
LDMessage(role="system", content="You are an evaluator."),
LDMessage(role="user", content="Evaluate this response."),
def _make_raw_variation(self, enabled: bool = True) -> Dict[str, Any]:
"""Raw variation dict as returned by _client.variation for a judge flag."""
return {
"_ldMeta": {"enabled": enabled},
"messages": [
{"role": "system", "content": "You are an evaluator."},
{"role": "user", "content": "Evaluate this response."},
],
)
"model": {"name": "gpt-4o", "parameters": {}},
}

async def test_calls_handle_judge_call_with_correct_config_type(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -748,7 +747,7 @@ async def test_calls_handle_judge_call_with_correct_config_type(self):
assert isinstance(ctx, OptimizationJudgeContext)

async def test_messages_has_system_and_user_turns(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -763,7 +762,7 @@ async def test_messages_has_system_and_user_turns(self):
assert roles == ["system", "user"]

async def test_messages_system_content_matches_instructions(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -778,7 +777,7 @@ async def test_messages_system_content_matches_instructions(self):
assert system_msg.content == config.instructions

async def test_messages_user_content_matches_context_user_input(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -793,7 +792,7 @@ async def test_messages_user_content_matches_context_user_input(self):
assert user_msg.content == ctx.user_input

async def test_messages_user_content_contains_ld_user_message(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -808,7 +807,7 @@ async def test_messages_user_content_contains_ld_user_message(self):
assert "Evaluate this response." in user_msg.content

async def test_returns_zero_score_when_judge_disabled(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False)
self.mock_ldai._client.variation.return_value = self._make_raw_variation(enabled=False)
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
result = await self.client._evaluate_config_judge(
judge_key="quality",
Expand All @@ -821,48 +820,76 @@ async def test_returns_zero_score_when_judge_disabled(self):
assert result.score == 0.0
self.handle_judge_call.assert_not_called()

async def test_returns_zero_score_when_judge_has_no_messages(self):
judge_config = AIJudgeConfig(
key="ld-judge-key",
enabled=True,
create_tracker=MagicMock,
model=ModelConfig(name="gpt-4o", parameters={}),
messages=None,
)
self.mock_ldai.judge_config.return_value = judge_config
async def test_system_only_template_auto_generates_user_message(self):
"""When the flag template has only a system message, a user turn is synthesised."""
self.mock_ldai._client.variation.return_value = {
"_ldMeta": {"enabled": True},
"messages": [{"role": "system", "content": "You are an evaluator."}],
"model": {"name": "gpt-4o", "parameters": {}},
}
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
result = await self.client._evaluate_config_judge(
await self.client._evaluate_config_judge(
judge_key="quality",
optimization_judge=judge,
completion_response="Any.",
completion_response="The answer is 42.",
iteration=1,
reasoning_history="",
user_input="Anything?",
user_input="What is the answer?",
)
assert result.score == 0.0
self.handle_judge_call.assert_not_called()

async def test_template_variables_merged_into_judge_config_call(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
_, config, _, _ = self.handle_judge_call.call_args.args
user_msg = next(m for m in config.messages if m.role == "user")
assert "The answer is 42." in user_msg.content

async def test_template_variables_interpolated_into_messages(self):
"""Custom agent variables are interpolated into judge template messages."""
self.mock_ldai._client.variation.return_value = {
"_ldMeta": {"enabled": True},
"messages": [
{"role": "system", "content": "Evaluate in {{language}}."},
{"role": "user", "content": "Evaluate this response."},
],
"model": {"name": "gpt-4o", "parameters": {}},
}
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
variables = {"language": "Spanish"}
await self.client._evaluate_config_judge(
judge_key="quality",
optimization_judge=judge,
completion_response="Answer.",
iteration=1,
reasoning_history="",
user_input="Q?",
variables=variables,
variables={"language": "Spanish"},
)
call_kwargs = self.mock_ldai.judge_config.call_args
passed_vars = call_kwargs.args[3] if call_kwargs.args else call_kwargs.kwargs.get("variables", {})
assert passed_vars.get("language") == "Spanish"
assert "message_history" in passed_vars
assert "response_to_evaluate" in passed_vars
_, config, _, _ = self.handle_judge_call.call_args.args
assert "Spanish" in config.instructions

async def test_reserved_variables_interpolated_into_template_messages(self):
    """Templates that reference message_history / response_to_evaluate are filled in.

    The mocked flag variation puts ``{{message_history}}`` in its system
    message and ``{{response_to_evaluate}}`` in its user message. After the
    judge evaluation runs, the config handed to ``handle_judge_call`` must
    still carry the literal text around the system token and must contain the
    completion response in the user turn.
    """
    template_messages = [
        {"role": "system", "content": "History: {{message_history}}"},
        {"role": "user", "content": "Response: {{response_to_evaluate}}"},
    ]
    self.mock_ldai._client.variation.return_value = {
        "_ldMeta": {"enabled": True},
        "messages": template_messages,
        "model": {"name": "gpt-4o", "parameters": {}},
    }
    judge_under_test = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")

    await self.client._evaluate_config_judge(
        judge_key="quality",
        optimization_judge=judge_under_test,
        completion_response="My answer.",
        iteration=1,
        reasoning_history="",
        user_input="Q?",
    )

    # handle_judge_call is invoked with four positional arguments; the
    # second one is the config whose messages are inspected below.
    _first, config, _third, _fourth = self.handle_judge_call.call_args.args

    system_msg = next(msg for msg in config.messages if msg.role == "system")
    assert "History:" in system_msg.content

    user_msg = next(msg for msg in config.messages if msg.role == "user")
    assert "My answer." in user_msg.content

async def test_agent_tools_included_without_evaluation_tool(self):
self.mock_ldai.judge_config.return_value = self._make_judge_config()
self.mock_ldai._client.variation.return_value = self._make_raw_variation()
agent_tool = ToolDefinition(name="search", description="Search", input_schema={})
judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
await self.client._evaluate_config_judge(
Expand Down
Loading