feat: add multimodal get_image tool to query agent

rejojer · rejojer · commit 0340cb135577 · 2026-04-10T07:46:42.000+08:00
The query agent can now view images referenced in source documents via
the get_image tool, which returns a ToolOutputImage for the LLM to inspect
(or a ToolOutputText message when the image cannot be read).
The prompt is updated to view images when questions involve figures or visuals.
diff --git a/openkb/agent/query.py b/openkb/agent/query.py
@@ -5,7 +5,8 @@
 
 from agents import Agent, Runner, function_tool
 
-from openkb.agent.tools import read_wiki_file
+from agents import ToolOutputImage, ToolOutputText
+from openkb.agent.tools import read_wiki_file, read_wiki_image
 
 MAX_TURNS = 50
 from openkb.schema import get_agents_md
@@ -27,10 +28,13 @@
    - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
      with tight page ranges. The summary shows document tree structure with page
      ranges to help you target. Never fetch the whole document.
-5. Synthesize a clear, concise, well-cited answer grounded in wiki content.
+5. When source content references images (e.g. ![image](sources/images/doc/file.png)),
+   use get_image to view them. Always view images when the question asks about
+   a figure, chart, diagram, or visual content.
+6. Synthesize a clear, concise, well-cited answer grounded in wiki content.
 
 Answer based only on wiki content. Be concise.
-Before each tool call, briefly state what you are about to do.
+Before each tool call, output one short sentence explaining the reason.
 
 If you cannot find relevant information, say so clearly.
 """
@@ -62,12 +66,24 @@ def get_page_content_tool(doc_name: str, pages: str) -> str:
         from openkb.agent.tools import get_page_content
         return get_page_content(doc_name, pages, wiki_root)
 
+    @function_tool
+    def get_image(image_path: str) -> ToolOutputImage | ToolOutputText:
+        """View an image from the wiki.
+        Use when source content references images you need to see.
+        Args:
+            image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png').
+        """
+        payload = read_wiki_image(image_path, wiki_root)
+        if payload["type"] != "image":
+            return ToolOutputText(text=payload["text"])
+        return ToolOutputImage(image_url=payload["image_url"])
+
     from agents.model_settings import ModelSettings
 
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[read_file, get_page_content_tool],
+        tools=[read_file, get_page_content_tool, get_image],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
diff --git a/openkb/agent/tools.py b/openkb/agent/tools.py
@@ -133,6 +133,41 @@ def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
     return "\n\n".join(parts) + "\n\n"
 
 
+_MIME_TYPES = {
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".gif": "image/gif",
+    ".webp": "image/webp",
+    ".bmp": "image/bmp",
+}
+
+
+def read_wiki_image(path: str, wiki_root: str) -> dict:
+    """Read an image file from the wiki and return it as a base64 data URL.
+
+    Args:
+        path: Image path relative to *wiki_root* (e.g. ``"sources/images/doc/p1_img1.png"``).
+        wiki_root: Absolute path to the wiki root directory.
+
+    Returns:
+        A dict with ``type``, ``image_url`` keys for ``ToolOutputImage``, or a dict
+        with ``type``, ``text`` keys when the path escapes the root or is not a file.
+    """
+    import base64
+
+    root = Path(wiki_root).resolve()
+    full_path = (root / path).resolve()
+    if not full_path.is_relative_to(root):
+        return {"type": "text", "text": "Access denied: path escapes wiki root."}
+    if not full_path.is_file():  # also rejects directories, which read_bytes() cannot read
+        return {"type": "text", "text": f"Image not found: {path}"}
+
+    mime = _MIME_TYPES.get(full_path.suffix.lower(), "image/png")
+    b64 = base64.b64encode(full_path.read_bytes()).decode()
+    return {"type": "image", "image_url": f"data:{mime};base64,{b64}"}
+
+
 def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
     """Write or overwrite a Markdown file in the wiki.
 
diff --git a/tests/test_query.py b/tests/test_query.py
@@ -15,17 +15,16 @@ def test_agent_name(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert agent.name == "wiki-query"
 
-    def test_agent_has_two_tools(self, tmp_path):
+    def test_agent_has_three_tools(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
-        assert len(agent.tools) == 2
+        assert len(agent.tools) == 3
 
     def test_agent_tool_names(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         names = {t.name for t in agent.tools}
         assert "read_file" in names
         assert "get_page_content_tool" in names
-        assert "list_files" not in names
-        assert "pageindex_retrieve" not in names
+        assert "get_image" in names
 
     def test_instructions_mention_get_page_content(self, tmp_path):
         agent = build_query_agent(str(tmp_path), "gpt-4o-mini")