Commit be1f6aa

feat: Update litellm dependency constraints and enhance pricing functionality
- Updated the litellm dependency in pyproject.toml and uv.lock to constrain versions to >=1.71.1,<=1.82.0.
- Added functions in api_costs.py to retrieve per-token pricing from litellm and normalize it to per-million rates.
- Wired litellm pricing in as a fallback in _configured_token_pricing, covered by a new test in test_track_api_cost.py to ensure accurate cost calculations.
- Introduced normalization of tool calls in server.py (and of tool schemas in tokenize.py) for Qwen3.5 model compatibility.
1 parent 67e81fa commit be1f6aa

7 files changed

Lines changed: 159 additions & 13 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "openai>=2.14.0",
     "typer>=0.15.2",
-    "litellm>=1.71.1",
+    "litellm>=1.71.1,<=1.82.0",
     "weave>=0.52.24",
     "polars>=1.26.0",
     "tblib>=3.0.0",

src/art/api_costs.py

Lines changed: 49 additions & 0 deletions
@@ -57,11 +57,60 @@ class _AnthropicTokenUsage:
     }


+def _litellm_price_per_million(
+    model_info: Mapping[str, Any], field: str
+) -> float | None:
+    value = model_info.get(field)
+    if value is None or isinstance(value, bool):
+        return None
+    try:
+        return float(value) * 1_000_000
+    except (TypeError, ValueError):
+        return None
+
+
+def _litellm_token_pricing(model_name: str) -> TokenPricing | None:
+    try:
+        from litellm import get_model_info
+
+        model_info = get_model_info(model_name)
+    except Exception:
+        return None
+
+    if not isinstance(model_info, Mapping):
+        return None
+
+    prompt_per_million = _litellm_price_per_million(model_info, "input_cost_per_token")
+    completion_per_million = _litellm_price_per_million(
+        model_info, "output_cost_per_token"
+    )
+    if prompt_per_million is None or completion_per_million is None:
+        return None
+
+    cache_read_per_million = _litellm_price_per_million(
+        model_info, "cache_read_input_token_cost"
+    )
+    cache_creation_per_million = _litellm_price_per_million(
+        model_info, "cache_creation_input_token_cost"
+    )
+    return TokenPricing(
+        prompt_per_million=prompt_per_million,
+        completion_per_million=completion_per_million,
+        cached_prompt_per_million=cache_read_per_million,
+        cache_creation_per_million=cache_creation_per_million,
+        cache_read_per_million=cache_read_per_million,
+    )
+
+
 def _configured_token_pricing(model_name: str) -> TokenPricing | None:
     explicit = MODEL_TOKEN_PRICING.get(model_name)
     if explicit is not None:
         return explicit

+    litellm_pricing = _litellm_token_pricing(model_name)
+    if litellm_pricing is not None:
+        return litellm_pricing
+
     pricing = get_model_pricing(model_name)
     if pricing is None:
         return None
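Note: litellm reports prices in USD per single token, which is why the helper above scales by 1,000,000. A minimal sketch of what the fallback consumes (the model name and the example figures are illustrative, not taken from this commit):

from litellm import get_model_info

info = get_model_info("gpt-4o")  # any model litellm knows about
# e.g. {"input_cost_per_token": 2.5e-06, "output_cost_per_token": 1e-05, ...}
print(info["input_cost_per_token"] * 1_000_000)   # prompt cost per 1M tokens
print(info["output_cost_per_token"] * 1_000_000)  # completion cost per 1M tokens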

src/art/preprocessing/tokenize.py

Lines changed: 18 additions & 5 deletions
@@ -1,3 +1,4 @@
+from collections.abc import Callable
 from dataclasses import dataclass, field
 from functools import cached_property
 from itertools import takewhile
@@ -12,6 +13,22 @@

 from ..trajectories import History, Trajectory, TrajectoryGroup, get_messages

+ChatTemplateTool = dict[Any, Any] | Callable[..., Any]
+
+
+def _normalize_tools_for_chat_template(tools: Any) -> list[ChatTemplateTool] | None:
+    if tools is None:
+        return None
+    normalized_tools: list[ChatTemplateTool] = []
+    for tool in tools:
+        if callable(tool):
+            normalized_tools.append(tool)
+        elif isinstance(tool, dict) and "type" in tool:
+            normalized_tools.append(cast(dict[Any, Any], tool))
+        else:
+            normalized_tools.append({"type": "function", "function": tool})
+    return normalized_tools
+

 @dataclass
 class TokenizedResult:
@@ -199,11 +216,7 @@ def tokenize_trajectory(
         return None
     messages_and_choices = history.messages_and_choices[: last_assistant_index + 1]
     messages = get_messages(messages_and_choices)
-    tools: Any = (
-        [{"type": "function", "function": tool} for tool in history.tools]
-        if history.tools is not None
-        else None
-    )
+    tools = _normalize_tools_for_chat_template(history.tools)
     chat = cast(
         str,
         tokenizer.apply_chat_template(
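A quick illustration of the three branches in _normalize_tools_for_chat_template (the tool names and schemas below are made up for the example):

def get_weather(city: str) -> str: ...  # a callable tool

tools = [
    get_weather,                                      # callable: passed through unchanged
    {"type": "function", "function": {"name": "a"}},  # already wrapped: kept as-is
    {"name": "b", "parameters": {}},                  # bare schema: wrapped in the OpenAI shape
]
normalized = _normalize_tools_for_chat_template(tools)
# normalized[2] == {"type": "function", "function": {"name": "b", "parameters": {}}}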

src/art/tinker/renderers.py

Lines changed: 3 additions & 3 deletions
@@ -2,9 +2,9 @@ def get_renderer_name(base_model: str) -> str:
     if base_model.startswith("meta-llama/"):
         return "llama3"
     elif base_model.startswith("Qwen/Qwen3.5-"):
-        print("Defaulting to Qwen3.5 renderer with thinking for", base_model)
-        print(renderer_name_message)
-        return "qwen3_5"
+        # print("Defaulting to Qwen3.5 renderer with thinking for", base_model)
+        # print(renderer_name_message)
+        return "qwen3_5_disable_thinking"
     elif base_model.startswith("Qwen/Qwen3-"):
         if "Instruct" in base_model:
             return "qwen3_instruct"

src/art/tinker/server.py

Lines changed: 45 additions & 3 deletions
@@ -7,7 +7,7 @@
 import os
 import socket
 import time
-from typing import Annotated, AsyncGenerator, Literal
+from typing import Annotated, Any, AsyncGenerator, Literal, cast
 import uuid

 from fastapi import FastAPI, HTTPException, Request
@@ -47,6 +47,47 @@ class ModelUpsert(BaseModel):
     target: str


+def _normalize_qwen3_5_messages(
+    base_model: str, messages: list[ChatCompletionMessageParam]
+) -> list[dict[str, Any]]:
+    normalized_messages = [cast(dict[str, Any], message) for message in messages]
+    if not base_model.startswith("Qwen/Qwen3.5"):
+        return normalized_messages
+    for i, message in enumerate(normalized_messages):
+        tool_calls = message.get("tool_calls")
+        if not isinstance(tool_calls, list):
+            continue
+        normalized_tool_calls: list[Any] = []
+        changed = False
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                normalized_tool_calls.append(tool_call)
+                continue
+            function = tool_call.get("function")
+            if not isinstance(function, dict):
+                normalized_tool_calls.append(tool_call)
+                continue
+            arguments_json = function.get("arguments")
+            if not isinstance(arguments_json, str):
+                normalized_tool_calls.append(tool_call)
+                continue
+            try:
+                arguments = json.loads(arguments_json)
+            except json.JSONDecodeError:
+                normalized_tool_calls.append(tool_call)
+                continue
+            if not isinstance(arguments, dict):
+                normalized_tool_calls.append(tool_call)
+                continue
+            changed = True
+            normalized_tool_calls.append(
+                {**tool_call, "function": {**function, "arguments": arguments}}
+            )
+        if changed:
+            normalized_messages[i] = {**message, "tool_calls": normalized_tool_calls}
+    return normalized_messages
+
+
 @dataclass
 class OpenAICompatibleTinkerServer:
     host: str | None = None
@@ -389,9 +430,10 @@ async def prompt_tokens(
         messages: list[ChatCompletionMessageParam],
         tools: list[ChatCompletionToolUnionParam] | None,
     ) -> list[int]:
+        normalized_messages = _normalize_qwen3_5_messages(base_model, messages)
         encoding = self._get_renderer(base_model).tokenizer.apply_chat_template(
-            messages,  # type: ignore
-            tools=tools,  # type: ignore
+            cast(Any, normalized_messages),
+            tools=cast(Any, tools),
             add_generation_prompt=True,
         )
         if isinstance(encoding, BatchEncoding):
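To make the intent concrete: the OpenAI wire format carries tool-call arguments as a JSON string, while Qwen3.5 chat templates expect a mapping, so the helper decodes the string before tokenization. A sketch (the model id and message contents are made up):

message = {
    "role": "assistant",
    "tool_calls": [
        {
            "id": "call_1",
            "type": "function",
            # OpenAI wire format: arguments arrive as a JSON string
            "function": {"name": "lookup", "arguments": '{"city": "Paris"}'},
        }
    ],
}
[normalized] = _normalize_qwen3_5_messages("Qwen/Qwen3.5-4B", [message])
# The chat template now receives a mapping instead of a string:
assert normalized["tool_calls"][0]["function"]["arguments"] == {"city": "Paris"}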

tests/unit/test_track_api_cost.py

Lines changed: 42 additions & 0 deletions
@@ -346,6 +346,48 @@ async def _judge() -> _AnthropicResponse:
             0.0021
         )

+    @pytest.mark.asyncio
+    async def test_explicit_model_name_uses_litellm_pricing_fallback(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        import litellm
+
+        builder = MetricsBuilder(cost_context="train")
+
+        def _fake_get_model_info(model_name: str) -> dict[str, float]:
+            assert model_name == "openai/fallback-model"
+            return {
+                "input_cost_per_token": 2.5e-06,
+                "output_cost_per_token": 1.5e-05,
+                "cache_read_input_token_cost": 2.5e-07,
+            }
+
+        monkeypatch.setattr(litellm, "get_model_info", _fake_get_model_info)
+
+        @track_api_cost(
+            source="llm_judge/litellm_fallback",
+            provider="openai",
+            model_name="openai/fallback-model",
+        )
+        async def _judge() -> _OpenAIResponse:
+            return _OpenAIResponse(
+                prompt_tokens=100,
+                completion_tokens=50,
+                cached_tokens=80,
+            )
+
+        token = builder.activate()
+        try:
+            await _judge()
+        finally:
+            token.var.reset(token)
+
+        metrics = await builder.flush()
+        expected = ((20 * 2.5) + (80 * 0.25) + (50 * 15.0)) / 1_000_000
+        assert metrics["costs/train/llm_judge/litellm_fallback"] == pytest.approx(
+            expected
+        )
+
     @pytest.mark.asyncio
     async def test_explicit_model_name_does_not_depend_on_response_model(self) -> None:
         builder = MetricsBuilder(cost_context="train")
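For clarity, the expected figure in the new test works out as follows: of the 100 prompt tokens, 80 are cached and billed at the cache-read rate ($0.25 per 1M), the remaining 20 at the full prompt rate ($2.50 per 1M), plus 50 completion tokens at $15.00 per 1M:

expected = (20 * 2.5 + 80 * 0.25 + 50 * 15.0) / 1_000_000
#        = (50 + 20 + 750) / 1_000_000
#        = 0.00082 USD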

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
