Skip to content

Commit 0340cb1

Browse files
committed
feat: add multimodal get_image tool to query agent
Query agent can now view images referenced in source documents via get_image tool, which returns ToolOutputImage for the LLM to inspect. Prompt updated to use images when questions involve figures or visuals.
1 parent ad05577 commit 0340cb1

3 files changed

Lines changed: 58 additions & 8 deletions

File tree

openkb/agent/query.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55

66
from agents import Agent, Runner, function_tool
77

8-
from openkb.agent.tools import read_wiki_file
8+
from agents import ToolOutputImage, ToolOutputText
9+
from openkb.agent.tools import read_wiki_file, read_wiki_image
910

1011
MAX_TURNS = 50
1112
from openkb.schema import get_agents_md
@@ -27,10 +28,13 @@
2728
- PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
2829
with tight page ranges. The summary shows document tree structure with page
2930
ranges to help you target. Never fetch the whole document.
30-
5. Synthesize a clear, concise, well-cited answer grounded in wiki content.
31+
5. When source content references images (e.g. ![image](sources/images/doc/file.png)),
32+
use get_image to view them. Always view images when the question asks about
33+
a figure, chart, diagram, or visual content.
34+
6. Synthesize a clear, concise, well-cited answer grounded in wiki content.
3135
3236
Answer based only on wiki content. Be concise.
33-
Before each tool call, briefly state what you are about to do.
37+
Before each tool call, output one short sentence explaining the reason.
3438
3539
If you cannot find relevant information, say so clearly.
3640
"""
@@ -62,12 +66,24 @@ def get_page_content_tool(doc_name: str, pages: str) -> str:
6266
from openkb.agent.tools import get_page_content
6367
return get_page_content(doc_name, pages, wiki_root)
6468

69+
@function_tool
def get_image(image_path: str) -> ToolOutputImage | ToolOutputText:
    """View an image from the wiki.
    Use when source content references images you need to see.
    Args:
        image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png').
    """
    # NOTE(review): the docstring above is kept verbatim — presumably
    # function_tool surfaces it to the model as the tool description;
    # confirm before rewording it.
    # wiki_root is taken from the enclosing scope.
    payload = read_wiki_image(image_path, wiki_root)

    # Anything that is not an image payload carries an explanatory message
    # in its "text" field; relay it so the model can see what went wrong.
    if payload["type"] != "image":
        return ToolOutputText(text=payload["text"])

    # Successful read: pass the data URL through as an image tool output.
    return ToolOutputImage(image_url=payload["image_url"])
80+
6581
from agents.model_settings import ModelSettings
6682

6783
return Agent(
6884
name="wiki-query",
6985
instructions=instructions,
70-
tools=[read_file, get_page_content_tool],
86+
tools=[read_file, get_page_content_tool, get_image],
7187
model=f"litellm/{model}",
7288
model_settings=ModelSettings(parallel_tool_calls=False),
7389
)

openkb/agent/tools.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,41 @@ def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
133133
return "\n\n".join(parts) + "\n\n"
134134

135135

136+
# Maps lower-cased file suffixes to the MIME type used in the data URL.
_MIME_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".bmp": "image/bmp",
}


def read_wiki_image(path: str, wiki_root: str) -> dict:
    """Read an image file from the wiki and return it as a base64 data URL.

    Failures are reported through the returned dict rather than raised, so
    the caller can hand the message straight back to the model.

    Args:
        path: Image path relative to *wiki_root* (e.g. ``"sources/images/doc/p1_img1.png"``).
        wiki_root: Absolute path to the wiki root directory.

    Returns:
        On success, ``{"type": "image", "image_url": <data URL>}``.
        On failure (path escapes the root, path missing, path is not a
        regular file, or the file cannot be read),
        ``{"type": "text", "text": <message>}``.
    """
    import base64

    root = Path(wiki_root).resolve()
    full_path = (root / path).resolve()
    # Resolve first, then compare: this rejects ``..`` segments and symlinks
    # that would lead outside the wiki root.
    if not full_path.is_relative_to(root):
        return {"type": "text", "text": "Access denied: path escapes wiki root."}
    if not full_path.exists():
        return {"type": "text", "text": f"Image not found: {path}"}
    # A directory (including an empty ``path``, which resolves to the root
    # itself) passes ``exists()`` but cannot be read as bytes.
    if not full_path.is_file():
        return {"type": "text", "text": f"Not a file: {path}"}

    # Unknown suffixes fall back to PNG.
    mime = _MIME_TYPES.get(full_path.suffix.lower(), "image/png")
    try:
        raw = full_path.read_bytes()
    except OSError as exc:
        # e.g. permission denied, or the file vanished after the checks above.
        return {"type": "text", "text": f"Could not read image {path}: {exc}"}

    b64 = base64.b64encode(raw).decode("ascii")
    return {"type": "image", "image_url": f"data:{mime};base64,{b64}"}
169+
170+
136171
def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
137172
"""Write or overwrite a Markdown file in the wiki.
138173

tests/test_query.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,16 @@ def test_agent_name(self, tmp_path):
1515
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
1616
assert agent.name == "wiki-query"
1717

18-
def test_agent_has_two_tools(self, tmp_path):
18+
def test_agent_has_three_tools(self, tmp_path):
1919
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
20-
assert len(agent.tools) == 2
20+
assert len(agent.tools) == 3
2121

2222
def test_agent_tool_names(self, tmp_path):
2323
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
2424
names = {t.name for t in agent.tools}
2525
assert "read_file" in names
2626
assert "get_page_content_tool" in names
27-
assert "list_files" not in names
28-
assert "pageindex_retrieve" not in names
27+
assert "get_image" in names
2928

3029
def test_instructions_mention_get_page_content(self, tmp_path):
3130
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")

0 commit comments

Comments
 (0)