Sanitize persisted chat image history

rejojer · rejojer · commit 822dbc3a11ce · 2026-04-11T23:08:56.000+08:00
diff --git a/openkb/agent/chat_session.py b/openkb/agent/chat_session.py
@@ -1,9 +1,10 @@
 """Chat session persistence for `openkb chat`.
 
-Each session lives in ``<kb>/.openkb/chats/<id>.json`` and stores the full
+Each session lives in ``<kb>/.openkb/chats/<id>.json`` and stores a sanitized
 agent-SDK history (from ``RunResult.to_input_list()``) alongside the user
 messages and full assistant replies kept as plain strings for display and
-export.
+export. Large tool-returned image payloads are replaced with lightweight
+references before the history is reused or persisted.
 """
 from __future__ import annotations
 
@@ -17,6 +18,11 @@
 from typing import Any
 
 
+_IMAGE_HISTORY_NOTE = (
+    "Image output omitted from chat history to avoid persisting raw data URLs."
+)  # Opening sentence of the text part that replaces a data-URL image in history.
+
+
 def _utcnow_iso() -> str:
     return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
@@ -38,6 +44,71 @@ def _title_from(msg: str, limit: int = 60) -> str:
     return msg[: limit - 1] + "\u2026"
 
 
+def _image_history_placeholder(image_path: str | None) -> dict[str, str]:
+    """Build the ``input_text`` part that stands in for an omitted image."""
+    parts = [_IMAGE_HISTORY_NOTE]
+    parts += [f" Source path: {image_path}."] if image_path else []
+    parts.append(" Call get_image again if you need to inspect it.")
+    return {"type": "input_text", "text": "".join(parts)}
+
+
+def _extract_get_image_path(item: dict[str, Any]) -> str | None:
+    """Return the ``image_path`` argument of a ``get_image`` call item, else None."""
+    if item.get("type") != "function_call" or item.get("name") != "get_image":
+        return None
+    arguments = item.get("arguments")
+    if not isinstance(arguments, str):
+        return None
+    try:
+        payload = json.loads(arguments)
+    except json.JSONDecodeError:
+        return None
+    # Valid JSON need not be an object (e.g. "[]" or "null"); guard before .get.
+    image_path = payload.get("image_path") if isinstance(payload, dict) else None
+    return image_path if isinstance(image_path, str) and image_path else None
+
+
+def _sanitize_history_value(value: Any, image_path: str | None = None) -> Any:
+    """Recursively rebuild *value*, swapping data-URL image parts for a text note."""
+    if isinstance(value, dict):
+        url = value.get("image_url") if value.get("type") == "input_image" else None
+        # Only inline ``data:`` payloads are dropped; other image URLs are kept.
+        if isinstance(url, str) and url.startswith("data:"):
+            return _image_history_placeholder(image_path)
+        return {
+            name: _sanitize_history_value(child, image_path)
+            for name, child in value.items()
+        }
+    if isinstance(value, list):
+        return [_sanitize_history_value(child, image_path) for child in value]
+    # Scalars (and any other non-container types) pass through untouched.
+    return value
+
+
+def sanitize_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Return a new list based on *history* with inline image data replaced by text.
+
+    ``get_image`` calls are remembered by ``call_id`` so that the matching
+    ``function_call_output`` placeholder can name the image's source path.
+    Entries that are not dicts are carried over unchanged.
+    """
+    paths: dict[str, str] = {}
+    result: list[dict[str, Any]] = []
+    for entry in history:
+        if not isinstance(entry, dict):
+            result.append(entry)
+            continue
+        requested = _extract_get_image_path(entry)
+        call_id = entry.get("call_id")
+        has_id = isinstance(call_id, str)
+        if requested and has_id:
+            paths[call_id] = requested
+        is_output = entry.get("type") == "function_call_output"
+        source = paths.get(call_id) if is_output and has_id else None
+        result.append(_sanitize_history_value(entry, source))
+    return result
+
+
 @dataclass
 class ChatSession:
     id: str
@@ -99,7 +170,7 @@ def record_turn(
         assistant_text: str,
         new_history: list[dict[str, Any]],
     ) -> None:
-        self.history = new_history
+        self.history = sanitize_history(new_history)
         self.user_turns.append(user_message)
         self.assistant_texts.append(assistant_text)
         self.turn_count = len(self.user_turns)
@@ -120,7 +191,7 @@ def load_session(kb_dir: Path, session_id: str) -> ChatSession:
         language=data.get("language", "en"),
         title=data.get("title", ""),
         turn_count=data.get("turn_count", 0),
-        history=data.get("history", []),
+        history=sanitize_history(data.get("history", [])),
         user_turns=data.get("user_turns", []),
         assistant_texts=data.get("assistant_texts", []),
         path=path,
diff --git a/tests/test_chat_session.py b/tests/test_chat_session.py
@@ -0,0 +1,76 @@
+"""Tests for chat session persistence."""
+from __future__ import annotations
+
+import json
+
+from openkb.agent.chat_session import ChatSession, load_session
+
+
+def _image_history() -> list[dict[str, object]]:
+    """Return a user message, a ``get_image`` call and its data-URL output."""
+    image_part = {
+        "type": "input_image",
+        "image_url": "data:image/png;base64,AAAA",
+    }
+    call = {
+        "type": "function_call",
+        "call_id": "call_123",
+        "name": "get_image",
+        "arguments": '{"image_path":"sources/images/doc/figure-1.png"}',
+    }
+    result = {
+        "type": "function_call_output",
+        "call_id": "call_123",
+        "output": [image_part],
+    }
+    # The call and its output share one call_id so they can be paired up.
+    user = {"role": "user", "content": "Describe the diagram."}
+    return [user, call, result]
+
+
+def test_record_turn_replaces_data_image_with_text_reference(tmp_path):
+    """Recording a turn persists a text placeholder instead of the data URL."""
+    session = ChatSession.new(tmp_path, "gpt-4o-mini", "en")
+    history = _image_history()
+
+    session.record_turn("Describe the diagram.", "It is a flow chart.", history)
+
+    # Inspect the file on disk: the raw payload must not appear anywhere in it.
+    raw = session.path.read_text(encoding="utf-8")
+    placeholder = json.loads(raw)["history"][2]["output"][0]
+
+    assert "data:image/png;base64,AAAA" not in raw
+    assert placeholder["type"] == "input_text"
+    text = placeholder["text"]
+    assert "sources/images/doc/figure-1.png" in text
+    assert "Call get_image again" in text
+
+
+def test_load_session_sanitizes_legacy_image_history(tmp_path):
+    """Loading a session file that holds a raw data URL yields sanitized history."""
+    session = ChatSession.new(tmp_path, "gpt-4o-mini", "en")
+    raw_history = _image_history()
+    legacy_payload = {
+        "id": session.id,
+        "created_at": session.created_at,
+        "updated_at": session.updated_at,
+        "model": session.model,
+        "language": session.language,
+        "title": "",
+        "turn_count": 1,
+        "history": raw_history,
+        "user_turns": ["Describe the diagram."],
+        "assistant_texts": ["It is a flow chart."],
+    }
+
+    # Write the JSON by hand so the unsanitized history lands on disk as-is,
+    # without going through record_turn.
+    session.path.parent.mkdir(parents=True, exist_ok=True)
+    session.path.write_text(json.dumps(legacy_payload), encoding="utf-8")
+
+    loaded = load_session(tmp_path, session.id)
+
+    placeholder = loaded.history[2]["output"][0]
+    assert placeholder["type"] == "input_text"
+    assert "data:image/png;base64,AAAA" not in placeholder["text"]
+    assert "sources/images/doc/figure-1.png" in placeholder["text"]