Skip to content

Commit 49afbdb

Browse files
committed
feat: replace pageindex_retrieve with get_page_content, unify query for all docs
Remove _pageindex_retrieve_impl and the pageindex_retrieve tool; add get_page_content_tool that uses the local JSON-based page store for all long documents. Update instructions and schema description accordingly.
1 parent cc6215a commit 49afbdb

3 files changed

Lines changed: 32 additions & 239 deletions

File tree

openkb/agent/query.py

Lines changed: 19 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,7 @@
33

44
from pathlib import Path
55

6-
import litellm
76
from agents import Agent, Runner, function_tool
8-
import os
9-
10-
from pageindex import PageIndexClient
117

128
from openkb.agent.tools import list_wiki_files, read_wiki_file
139
from openkb.schema import SCHEMA_MD, get_agents_md
@@ -18,152 +14,30 @@
1814
{schema_md}
1915
2016
## Search strategy
21-
1. Start by reading index.md to understand what documents and concepts are available.
22-
2. Read relevant summary pages (summaries/) to get document overviews.
17+
1. Read index.md to understand what documents and concepts are available.
18+
Each entry has a brief summary to help you judge relevance.
19+
2. Read relevant summary pages (summaries/) for document overviews.
2320
3. Read concept pages (concepts/) for cross-document synthesis.
24-
4. For long documents indexed with PageIndex, call pageindex_retrieve with the
25-
document ID and the user's question to get detailed page-level content.
21+
4. For long documents, use get_page_content_tool(doc_name, pages) to read
22+
specific pages when you need detailed content. The summary page
23+
shows chapter structure with page ranges to help you decide which
24+
pages to read.
2625
5. Synthesise a clear, well-cited answer.
2726
2827
Always ground your answer in the wiki content. If you cannot find relevant
2928
information, say so clearly.
3029
"""
3130

3231

33-
def _pageindex_retrieve_impl(doc_id: str, question: str, openkb_dir: str, model: str) -> str:
34-
"""Retrieve relevant content from a long document via PageIndex.
35-
36-
For cloud-indexed docs: delegates to col.query() directly.
37-
For local docs: uses structure-based page selection + get_page_content.
38-
"""
39-
pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "")
40-
# Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix)
41-
is_cloud_doc = doc_id.startswith("pi-")
42-
43-
if is_cloud_doc:
44-
# Cloud doc: use PageIndex streaming query (avoids timeout, shows progress)
45-
import sys
46-
import asyncio
47-
import threading
48-
49-
client = PageIndexClient(api_key=pageindex_api_key or None, model=model)
50-
col = client.collection()
51-
try:
52-
stream = col.query(question, doc_ids=[doc_id], stream=True)
53-
collected: list[str] = []
54-
done = threading.Event()
55-
56-
async def _consume():
57-
try:
58-
async for event in stream:
59-
if event.type == "answer_delta":
60-
sys.stdout.write(event.data)
61-
sys.stdout.flush()
62-
collected.append(event.data)
63-
elif event.type == "tool_call":
64-
name = event.data.get("name", "")
65-
args = event.data.get("args", "")
66-
sys.stdout.write(f"\n [PageIndex] {name}({args})\n")
67-
sys.stdout.flush()
68-
sys.stdout.write("\n")
69-
sys.stdout.flush()
70-
finally:
71-
done.set()
72-
73-
# Run streaming in a separate thread with its own event loop
74-
def _run():
75-
loop = asyncio.new_event_loop()
76-
loop.run_until_complete(_consume())
77-
loop.close()
78-
79-
t = threading.Thread(target=_run, daemon=True)
80-
t.start()
81-
t.join(timeout=120)
82-
return "".join(collected) if collected else "No answer from PageIndex."
83-
except Exception as exc:
84-
return f"Error querying cloud PageIndex: {exc}"
85-
86-
# Local doc: use local PageIndex with structure-based retrieval
87-
client = PageIndexClient(model=model, storage_path=openkb_dir)
88-
col = client.collection()
89-
90-
try:
91-
structure = col.get_document_structure(doc_id)
92-
except Exception as exc:
93-
return f"Error retrieving document structure: {exc}"
94-
95-
if not structure:
96-
return "No structure found for document."
97-
sections = []
98-
for idx, node in enumerate(structure):
99-
title = node.get("title", f"Section {idx + 1}")
100-
node_id = node.get("node_id", str(idx))
101-
summary = node.get("summary", "")
102-
start = node.get("start_index", idx)
103-
end = node.get("end_index", idx)
104-
sections.append(
105-
f"node_id={node_id} title='{title}' pages={start}-{end} summary='{summary}'"
106-
)
107-
108-
sections_text = "\n".join(sections)
109-
prompt = (
110-
f"Given the following document sections:\n{sections_text}\n\n"
111-
f"Which page ranges are most relevant to this question: '{question}'?\n"
112-
"Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). "
113-
"Return ONLY the page specification, nothing else."
114-
)
115-
116-
# 2. Ask LLM which pages are relevant
117-
try:
118-
response = litellm.completion(
119-
model=model,
120-
messages=[{"role": "user", "content": prompt}],
121-
)
122-
page_spec = response.choices[0].message.content.strip()
123-
except Exception as exc:
124-
return f"Error selecting relevant pages: {exc}"
125-
126-
if not page_spec:
127-
return "Could not determine relevant pages."
128-
129-
# 3. Fetch those pages
130-
try:
131-
pages = col.get_page_content(doc_id, page_spec)
132-
except Exception as exc:
133-
return f"Error fetching page content: {exc}"
134-
135-
if not pages:
136-
return f"No content found for pages: {page_spec}"
137-
138-
parts = []
139-
for item in pages:
140-
page_num = item.get("page_index", "?")
141-
text = item.get("text", "")
142-
parts.append(f"[Page {page_num}]\n{text}")
143-
144-
return "\n\n".join(parts)
145-
146-
147-
def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str = "en") -> Agent:
148-
"""Build and return the Q&A agent.
149-
150-
Args:
151-
wiki_root: Absolute path to the wiki directory.
152-
openkb_dir: Path to the .openkb/ state directory.
153-
model: LLM model name.
154-
language: Language code for wiki content (e.g. 'en', 'fr').
155-
156-
Returns:
157-
Configured :class:`~agents.Agent` instance.
158-
"""
32+
def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
33+
"""Build and return the Q&A agent."""
15934
schema_md = get_agents_md(Path(wiki_root))
16035
instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
16136
instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
16237

16338
@function_tool
16439
def list_files(directory: str) -> str:
16540
"""List all Markdown files in a wiki subdirectory.
166-
16741
Args:
16842
directory: Subdirectory path relative to wiki root (e.g. 'sources').
16943
"""
@@ -172,31 +46,29 @@ def list_files(directory: str) -> str:
17246
@function_tool
17347
def read_file(path: str) -> str:
17448
"""Read a Markdown file from the wiki.
175-
17649
Args:
17750
path: File path relative to wiki root (e.g. 'summaries/paper.md').
17851
"""
17952
return read_wiki_file(path, wiki_root)
18053

18154
@function_tool
182-
def pageindex_retrieve(doc_id: str, question: str) -> str:
183-
"""Retrieve relevant content from a long document via PageIndex.
184-
185-
Use this when you need detailed content from a document that was
186-
indexed with PageIndex (long documents).
187-
55+
def get_page_content_tool(doc_name: str, pages: str) -> str:
56+
"""Get text content of specific pages from a long document.
57+
Use this when you need detailed content from a document. The summary
58+
page shows chapter structure with page ranges.
18859
Args:
189-
doc_id: PageIndex document identifier (found in index.md).
190-
question: The question you are trying to answer.
60+
doc_name: Document name (e.g. 'attention-is-all-you-need').
61+
pages: Page specification (e.g. '3-5,7,10-12').
19162
"""
192-
return _pageindex_retrieve_impl(doc_id, question, openkb_dir, model)
63+
from openkb.agent.tools import get_page_content
64+
return get_page_content(doc_name, pages, wiki_root)
19365

19466
from agents.model_settings import ModelSettings
19567

19668
return Agent(
19769
name="wiki-query",
19870
instructions=instructions,
199-
tools=[list_files, read_file, pageindex_retrieve],
71+
tools=[list_files, read_file, get_page_content_tool],
20072
model=f"litellm/{model}",
20173
model_settings=ModelSettings(parallel_tool_calls=False),
20274
)
@@ -224,9 +96,8 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
22496
language: str = config.get("language", "en")
22597

22698
wiki_root = str(kb_dir / "wiki")
227-
openkb_path = str(openkb_dir)
22899

229-
agent = build_query_agent(wiki_root, openkb_path, model, language=language)
100+
agent = build_query_agent(wiki_root, model, language=language)
230101

231102
if not stream:
232103
result = await Runner.run(agent, question)

openkb/schema.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# Wiki Schema
77
88
## Directory Structure
9-
- sources/ — Full-text converted from raw documents. Do not modify directly.
9+
- sources/ — Document content. Short docs as .md, long docs as .json (per-page). Do not modify directly.
1010
- sources/images/ — Extracted images from documents, referenced by sources.
1111
- summaries/ — One per source document. Summary of key content.
1212
- concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents.

tests/test_query.py

Lines changed: 12 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -6,119 +6,41 @@
66

77
import pytest
88

9-
from openkb.agent.query import _pageindex_retrieve_impl, build_query_agent, run_query
9+
from openkb.agent.query import build_query_agent, run_query
1010
from openkb.schema import SCHEMA_MD
1111

1212

1313
class TestBuildQueryAgent:
1414
def test_agent_name(self, tmp_path):
15-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
15+
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
1616
assert agent.name == "wiki-query"
1717

1818
def test_agent_has_three_tools(self, tmp_path):
19-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
19+
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
2020
assert len(agent.tools) == 3
2121

2222
def test_agent_tool_names(self, tmp_path):
23-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
23+
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
2424
names = {t.name for t in agent.tools}
2525
assert "list_files" in names
2626
assert "read_file" in names
27-
assert "pageindex_retrieve" in names
27+
assert "get_page_content_tool" in names
28+
assert "pageindex_retrieve" not in names
2829

29-
def test_instructions_reference_registered_pageindex_tool(self, tmp_path):
30-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
31-
tool_names = {t.name for t in agent.tools}
32-
assert "pageindex_retrieve" in agent.instructions
33-
assert "pageindex_retrieve" in tool_names
30+
def test_instructions_mention_get_page_content(self, tmp_path):
31+
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
32+
assert "get_page_content" in agent.instructions
33+
assert "pageindex_retrieve" not in agent.instructions
3434

3535
def test_schema_in_instructions(self, tmp_path):
36-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
36+
agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
3737
assert SCHEMA_MD in agent.instructions
3838

3939
def test_agent_model(self, tmp_path):
40-
agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "my-model")
40+
agent = build_query_agent(str(tmp_path), "my-model")
4141
assert agent.model == "litellm/my-model"
4242

4343

44-
class TestPageindexRetrieve:
45-
def test_returns_page_content(self, tmp_path):
46-
mock_structure = [
47-
{
48-
"node_id": "n1",
49-
"title": "Introduction",
50-
"start_index": 1,
51-
"end_index": 5,
52-
"summary": "Overview section",
53-
}
54-
]
55-
mock_pages = [
56-
{"page_index": 1, "text": "Introduction text here."},
57-
{"page_index": 2, "text": "More intro content."},
58-
]
59-
60-
mock_col = MagicMock()
61-
mock_col.get_document_structure.return_value = mock_structure
62-
mock_col.get_page_content.return_value = mock_pages
63-
64-
mock_client = MagicMock()
65-
mock_client.collection.return_value = mock_col
66-
67-
with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
68-
patch("openkb.agent.query.litellm.completion") as mock_llm, \
69-
patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
70-
mock_llm.return_value = MagicMock(
71-
choices=[MagicMock(message=MagicMock(content="1-2"))]
72-
)
73-
result = _pageindex_retrieve_impl("doc123", "What is the intro?", "/db", "gpt-4o-mini")
74-
75-
assert "Introduction text here." in result
76-
assert "More intro content." in result
77-
78-
def test_cloud_doc_uses_streaming_query(self, tmp_path):
79-
"""Cloud doc (pi- prefix) delegates to col.query(stream=True)."""
80-
from dataclasses import dataclass
81-
from typing import Any
82-
83-
@dataclass
84-
class FakeEvent:
85-
type: str
86-
data: Any
87-
88-
class FakeStream:
89-
async def __aiter__(self):
90-
yield FakeEvent(type="answer_delta", data="Cloud ")
91-
yield FakeEvent(type="answer_delta", data="answer about MCP.")
92-
93-
mock_stream = FakeStream()
94-
95-
mock_col = MagicMock()
96-
mock_col.query.return_value = mock_stream
97-
98-
mock_client = MagicMock()
99-
mock_client.collection.return_value = mock_col
100-
101-
with patch("openkb.agent.query.PageIndexClient", return_value=mock_client):
102-
result = _pageindex_retrieve_impl("pi-abc123", "What is MCP?", "/db", "gpt-4o-mini")
103-
104-
assert "Cloud answer about MCP." in result
105-
mock_col.query.assert_called_once_with("What is MCP?", doc_ids=["pi-abc123"], stream=True)
106-
107-
def test_local_empty_structure_returns_error(self, tmp_path):
108-
"""Local doc with empty structure returns error."""
109-
mock_col = MagicMock()
110-
mock_col.get_document_structure.return_value = []
111-
112-
mock_client = MagicMock()
113-
mock_client.collection.return_value = mock_col
114-
115-
with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
116-
patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
117-
result = _pageindex_retrieve_impl("local-uuid-123", "What?", "/db", "gpt-4o-mini")
118-
119-
assert "No structure found" in result
120-
121-
12244
class TestRunQuery:
12345
@pytest.mark.asyncio
12446
async def test_run_query_returns_final_output(self, tmp_path):

0 commit comments

Comments
 (0)