Commit 52207f9

Add batched Modal CPU inference path for AI trading bot
1 parent f525f32 commit 52207f9

3 files changed

Lines changed: 192 additions & 14 deletions

llm_trader.py

Lines changed: 2 additions & 2 deletions
@@ -153,8 +153,8 @@ def propose_trades_with_llm(config, candidates, max_positions=10, allow_shorts=T
 
     predictions = []
     failures = []
-    for candidate in prompt_candidates:
-        prediction = client.predict_candidate(candidate)
+    batch_predictions = client.predict_candidates(prompt_candidates)
+    for candidate, prediction in zip(prompt_candidates, batch_predictions):
         if prediction is None:
             failures.append(
                 {
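
For context, a minimal sketch of the alignment contract the new loop relies on (simplified; the bookkeeping around failures is abbreviated here): predict_candidates is expected to return one entry per submitted candidate, with None marking a failed inference, so the zip pairs every candidate with its own result.

    # Simplified sketch, not the full function body from this commit.
    batch_predictions = client.predict_candidates(prompt_candidates)

    predictions, failures = [], []
    for candidate, prediction in zip(prompt_candidates, batch_predictions):
        if prediction is None:
            failures.append({"symbol": candidate.get("symbol")})  # abbreviated failure record
        else:
            predictions.append(prediction)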

modal_trained_model_service.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+import json
+import os
+from typing import Any, Dict, List
+
+import modal
+
+APP_NAME = os.getenv("TRAINED_MODEL_MODAL_APP", "trading-bot-trained-model-inference")
+BASE_MODEL = os.getenv("TRAINED_MODEL_BASE_MODEL", "Qwen/Qwen2.5-7B-Instruct")
+VOLUME_NAME = os.getenv("TRAINED_MODEL_VOLUME", "train-once-artifacts")
+ADAPTER_PATH = os.getenv("TRAINED_MODEL_ADAPTER_PATH", "/artifacts/lora_solid_adapter")
+MODEL_NAME = os.getenv("TRAINED_MODEL_NAME", "quant-trained-trading-model")
+CPU_COUNT = int(os.getenv("TRAINED_MODEL_CPU", "8"))
+MEMORY_MB = int(os.getenv("TRAINED_MODEL_MEMORY_MB", "65536"))
+
+app = modal.App(APP_NAME)
+image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "fastapi>=0.115.0",
+        "pydantic>=2.9.2",
+        "torch>=2.4.1",
+        "transformers>=4.46.0",
+        "peft>=0.13.2",
+        "accelerate>=1.0.1",
+        "sentencepiece>=0.2.0",
+    )
+)
+volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=False)
+
+_MODEL = None
+_TOKENIZER = None
+_TORCH = None
+
+
+def _load_runtime():
+    global _MODEL, _TOKENIZER, _TORCH
+    if _MODEL is not None and _TOKENIZER is not None and _TORCH is not None:
+        return _MODEL, _TOKENIZER, _TORCH
+
+    import torch
+    from peft import PeftModel
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    torch.set_num_threads(max(1, CPU_COUNT))
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+    )
+    model = PeftModel.from_pretrained(model, ADAPTER_PATH, is_trainable=False)
+    model.eval()
+
+    _MODEL = model
+    _TOKENIZER = tokenizer
+    _TORCH = torch
+    return _MODEL, _TOKENIZER, _TORCH
+
+
+def _candidate_prompt(candidate: Dict[str, Any]) -> str:
+    symbol = str(candidate.get("symbol") or "UNKNOWN").strip().upper()
+    as_of_date = candidate.get("as_of_date") or candidate.get("last_date") or "UNKNOWN"
+    lines = [
+        f"TICKER: {symbol}",
+        f"DATE: {as_of_date}",
+        "PRICE_ACTION:",
+        f"- last_close: {candidate.get('last_close')}",
+        f"- closes_tail: {candidate.get('closes_tail')}",
+        f"- volume_1d: {candidate.get('volume_1d')}",
+        f"- volume_20d_avg: {candidate.get('volume_20d_avg')}",
+        "INDICATORS:",
+        f"- return_1d: {candidate.get('return_1d')}",
+        f"- return_5d: {candidate.get('return_5d')}",
+        f"- return_10d: {candidate.get('return_10d')}",
+        f"- volatility_20d: {candidate.get('volatility_20d')}",
+        f"- dist_ma_20: {candidate.get('dist_ma_20')}",
+        f"- dist_ma_50: {candidate.get('dist_ma_50')}",
+        f"- rsi_14: {candidate.get('rsi_14')}",
+        f"- volume_ratio: {candidate.get('volume_ratio')}",
+        "NEWS_CONTEXT:",
+        f"- news_count_7d: {candidate.get('news_count_7d')}",
+        f"- news_sentiment_7d: {candidate.get('news_sentiment_7d')}",
+        "",
+        "QUESTION: Classify the expected 5-day return as STRONG_BUY | BUY | NEUTRAL | SELL | STRONG_SELL.",
+        "Return ONLY JSON using this schema:",
+        '{"label":"BUY","confidence":0.63,"reason":"..."}',
+    ]
+    return "\n".join(lines)
+
+
+def _extract_json(text: str):
+    if not text:
+        return None
+    text = str(text).strip()
+    try:
+        return json.loads(text)
+    except Exception:
+        pass
+    start = text.find("{")
+    end = text.rfind("}")
+    if start >= 0 and end > start:
+        try:
+            return json.loads(text[start : end + 1])
+        except Exception:
+            return None
+    return None
+
+
+def _predict_one(candidate: Dict[str, Any]) -> Dict[str, Any]:
+    model, tokenizer, torch = _load_runtime()
+    system = (
+        "You are the trained AI trading decision engine. "
+        "Return only valid JSON with label, confidence, and reason. "
+        "Use the provided market snapshot to classify the next 5-day return."
+    )
+    prompt = tokenizer.apply_chat_template(
+        [
+            {"role": "system", "content": system},
+            {"role": "user", "content": _candidate_prompt(candidate)},
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+    encoded = tokenizer(prompt, return_tensors="pt")
+    input_len = encoded["input_ids"].shape[-1]
+    with torch.no_grad():
+        generated = model.generate(
+            **encoded,
+            max_new_tokens=64,
+            do_sample=False,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    text = tokenizer.decode(generated[0][input_len:], skip_special_tokens=True).strip()
+    parsed = _extract_json(text) or {"label": "NEUTRAL", "confidence": 0.5, "reason": text or "No parsable output."}
+    parsed["symbol"] = candidate.get("symbol")
+    return parsed
+
+
+@app.function(
+    image=image,
+    cpu=CPU_COUNT,
+    memory=MEMORY_MB,
+    scaledown_window=300,
+    timeout=3600,
+    volumes={"/artifacts": volume},
+)
+@modal.web_endpoint(method="POST")
+def predict_trade_candidates(payload: Dict[str, Any]):
+    candidates = payload.get("candidates") or []
+    if not isinstance(candidates, list):
+        candidate = payload.get("candidate")
+        candidates = [candidate] if isinstance(candidate, dict) else []
+    signals = [_predict_one(c) for c in candidates if isinstance(c, dict)]
+    return {
+        "model": MODEL_NAME,
+        "model_used": MODEL_NAME,
+        "signals": signals,
+    }
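
The new service loads the base model plus LoRA adapter once per container, builds a structured prompt per candidate, and serves a single POST web endpoint accepting either a candidates list or a single candidate object. A hedged sketch of deploying and calling it follows; the endpoint URL is a placeholder (Modal prints the real URL on deploy) and the candidate values are illustrative only.

    # Deploy with the Modal CLI:
    #   modal deploy modal_trained_model_service.py
    import requests

    # Placeholder URL; use the web endpoint URL printed by `modal deploy`.
    ENDPOINT_URL = "https://<workspace>--trading-bot-trained-model-inference-predict-trade-candidates.modal.run"

    payload = {
        "task": "trade_signal_classification",
        "candidates": [
            {"symbol": "AAPL", "last_close": 227.5, "return_5d": 0.012, "rsi_14": 58.3},  # illustrative values
        ],
    }
    resp = requests.post(ENDPOINT_URL, json=payload, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    # Response shape: {"model": ..., "model_used": ..., "signals": [{"label", "confidence", "reason", "symbol"}, ...]}
    for signal in data["signals"]:
        print(signal.get("symbol"), signal.get("label"), signal.get("confidence"))

The trained_model_client.py changes below wrap this same request/response contract.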

trained_model_client.py

Lines changed: 29 additions & 12 deletions
@@ -2,7 +2,7 @@
 import logging
 import os
 import re
-from typing import Optional
+from typing import List, Optional
 
 import requests
 
@@ -33,7 +33,7 @@ def __init__(self, ai_cfg: Optional[dict] = None):
         self.api_key_env = str(model_cfg.get("api_key_env", "") or "").strip()
         self.api_key = os.getenv(self.api_key_env).strip() if self.api_key_env and os.getenv(self.api_key_env) else ""
         self.timeout_seconds = int(model_cfg.get("timeout_seconds", 60) or 60)
-        self.model_name = str(model_cfg.get("model_name", "trained-trading-model") or "trained-trading-model").strip()
+        self.model_name = str(model_cfg.get("model_name", "quant-trained-trading-model") or "quant-trained-trading-model").strip()
         self.last_error = None
         self.last_model_used = None
 
@@ -51,19 +51,32 @@ def is_ready(self) -> bool:
         return True
 
     def predict_candidate(self, candidate: dict) -> Optional[dict]:
+        results = self.predict_candidates([candidate])
+        return results[0] if results else None
+
+    def predict_candidates(self, candidates: List[dict]) -> List[Optional[dict]]:
         if not self.is_ready():
-            return None
+            return [None for _ in list(candidates or [])]
+        payload_candidates = [dict(c or {}) for c in list(candidates or []) if isinstance(c, dict)]
+        if not payload_candidates:
+            return []
         try:
-            raw = self._predict_http(candidate)
+            raw_signals = self._predict_batch_http(payload_candidates)
         except Exception as exc:
             self.last_error = str(exc)
-            logger.warning("Trained model inference failed for %s: %s", candidate.get("symbol"), exc)
-            return None
-        return self._normalize_prediction(raw)
+            logger.warning("Trained model batch inference failed: %s", exc)
+            return [None for _ in payload_candidates]
+
+        out = []
+        for signal in raw_signals:
+            out.append(self._normalize_prediction(signal))
+        while len(out) < len(payload_candidates):
+            out.append(None)
+        return out[: len(payload_candidates)]
 
-    def _predict_http(self, candidate: dict):
+    def _predict_batch_http(self, candidates: List[dict]):
         payload = {
-            "candidate": candidate,
+            "candidates": candidates,
            "task": "trade_signal_classification",
         }
         headers = {"Content-Type": "application/json", "Accept": "application/json"}
@@ -73,9 +86,13 @@ def _predict_http(self, candidate: dict):
         response.raise_for_status()
         data = response.json()
         self.last_model_used = data.get("model") or data.get("model_used") or self.model_identifier
-        if isinstance(data.get("signal"), dict):
-            return data["signal"]
-        return data
+        signals = data.get("signals")
+        if isinstance(signals, list):
+            return signals
+        signal = data.get("signal")
+        if signal is not None:
+            return [signal]
+        return []
 
     def _normalize_prediction(self, raw) -> Optional[dict]:
         parsed = raw
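
A minimal usage sketch of the updated client (the class name TrainedModelClient and the shape of ai_cfg are assumptions; only __init__, predict_candidate, predict_candidates, and _predict_batch_http are visible in this diff): the batch method returns one entry per candidate, None where inference failed, and predict_candidate is now a thin wrapper around it.

    # Hypothetical usage; class name and config keys are assumed, methods match this diff.
    client = TrainedModelClient(ai_cfg)  # ai_cfg: dict carrying the trained-model settings

    candidates = [
        {"symbol": "AAPL", "rsi_14": 58.3},
        {"symbol": "MSFT", "rsi_14": 44.1},
    ]
    signals = client.predict_candidates(candidates)  # aligned with input; None on failure

    for candidate, signal in zip(candidates, signals):
        if signal is None:
            print(candidate["symbol"], "-> no signal (see client.last_error)")
        else:
            print(candidate["symbol"], "->", signal.get("label"), signal.get("confidence"))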
