feat: replace pageindex_retrieve with get_page_content, unify query for all docs

KylinMountain · KylinMountain · commit 49afbdb50828 · 2026-04-09T23:43:09.000+08:00
Remove _pageindex_retrieve_impl and the pageindex_retrieve tool; add
get_page_content_tool that uses the local JSON-based page store for all
long documents. Update instructions and schema description accordingly.
diff --git a/openkb/agent/query.py b/openkb/agent/query.py
@@ -3,11 +3,7 @@
 
 from pathlib import Path
 
-import litellm
 from agents import Agent, Runner, function_tool
-import os
-
-from pageindex import PageIndexClient
 
 from openkb.agent.tools import list_wiki_files, read_wiki_file
 from openkb.schema import SCHEMA_MD, get_agents_md
@@ -18,152 +14,30 @@
 {schema_md}
 
 ## Search strategy
-1. Start by reading index.md to understand what documents and concepts are available.
-2. Read relevant summary pages (summaries/) to get document overviews.
+1. Read index.md to understand what documents and concepts are available.
+   Each entry has a brief summary to help you judge relevance.
+2. Read relevant summary pages (summaries/) for document overviews.
 3. Read concept pages (concepts/) for cross-document synthesis.
-4. For long documents indexed with PageIndex, call pageindex_retrieve with the
-   document ID and the user's question to get detailed page-level content.
+4. For long documents, use get_page_content(doc_name, pages) to read
+   specific pages when you need detailed content. The summary page
+   shows chapter structure with page ranges to help you decide which
+   pages to read.
 5. Synthesise a clear, well-cited answer.
 
 Always ground your answer in the wiki content. If you cannot find relevant
 information, say so clearly.
 """
 
 
-def _pageindex_retrieve_impl(doc_id: str, question: str, openkb_dir: str, model: str) -> str:
-    """Retrieve relevant content from a long document via PageIndex.
-
-    For cloud-indexed docs: delegates to col.query() directly.
-    For local docs: uses structure-based page selection + get_page_content.
-    """
-    pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "")
-    # Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix)
-    is_cloud_doc = doc_id.startswith("pi-")
-
-    if is_cloud_doc:
-        # Cloud doc: use PageIndex streaming query (avoids timeout, shows progress)
-        import sys
-        import asyncio
-        import threading
-
-        client = PageIndexClient(api_key=pageindex_api_key or None, model=model)
-        col = client.collection()
-        try:
-            stream = col.query(question, doc_ids=[doc_id], stream=True)
-            collected: list[str] = []
-            done = threading.Event()
-
-            async def _consume():
-                try:
-                    async for event in stream:
-                        if event.type == "answer_delta":
-                            sys.stdout.write(event.data)
-                            sys.stdout.flush()
-                            collected.append(event.data)
-                        elif event.type == "tool_call":
-                            name = event.data.get("name", "")
-                            args = event.data.get("args", "")
-                            sys.stdout.write(f"\n  [PageIndex] {name}({args})\n")
-                            sys.stdout.flush()
-                    sys.stdout.write("\n")
-                    sys.stdout.flush()
-                finally:
-                    done.set()
-
-            # Run streaming in a separate thread with its own event loop
-            def _run():
-                loop = asyncio.new_event_loop()
-                loop.run_until_complete(_consume())
-                loop.close()
-
-            t = threading.Thread(target=_run, daemon=True)
-            t.start()
-            t.join(timeout=120)
-            return "".join(collected) if collected else "No answer from PageIndex."
-        except Exception as exc:
-            return f"Error querying cloud PageIndex: {exc}"
-
-    # Local doc: use local PageIndex with structure-based retrieval
-    client = PageIndexClient(model=model, storage_path=openkb_dir)
-    col = client.collection()
-
-    try:
-        structure = col.get_document_structure(doc_id)
-    except Exception as exc:
-        return f"Error retrieving document structure: {exc}"
-
-    if not structure:
-        return "No structure found for document."
-    sections = []
-    for idx, node in enumerate(structure):
-        title = node.get("title", f"Section {idx + 1}")
-        node_id = node.get("node_id", str(idx))
-        summary = node.get("summary", "")
-        start = node.get("start_index", idx)
-        end = node.get("end_index", idx)
-        sections.append(
-            f"node_id={node_id} title='{title}' pages={start}-{end} summary='{summary}'"
-        )
-
-    sections_text = "\n".join(sections)
-    prompt = (
-        f"Given the following document sections:\n{sections_text}\n\n"
-        f"Which page ranges are most relevant to this question: '{question}'?\n"
-        "Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). "
-        "Return ONLY the page specification, nothing else."
-    )
-
-    # 2. Ask LLM which pages are relevant
-    try:
-        response = litellm.completion(
-            model=model,
-            messages=[{"role": "user", "content": prompt}],
-        )
-        page_spec = response.choices[0].message.content.strip()
-    except Exception as exc:
-        return f"Error selecting relevant pages: {exc}"
-
-    if not page_spec:
-        return "Could not determine relevant pages."
-
-    # 3. Fetch those pages
-    try:
-        pages = col.get_page_content(doc_id, page_spec)
-    except Exception as exc:
-        return f"Error fetching page content: {exc}"
-
-    if not pages:
-        return f"No content found for pages: {page_spec}"
-
-    parts = []
-    for item in pages:
-        page_num = item.get("page_index", "?")
-        text = item.get("text", "")
-        parts.append(f"[Page {page_num}]\n{text}")
-
-    return "\n\n".join(parts)
-
-
-def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str = "en") -> Agent:
-    """Build and return the Q&A agent.
-
-    Args:
-        wiki_root: Absolute path to the wiki directory.
-        openkb_dir: Path to the .openkb/ state directory.
-        model: LLM model name.
-        language: Language code for wiki content (e.g. 'en', 'fr').
-
-    Returns:
-        Configured :class:`~agents.Agent` instance.
-    """
+def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent:
+    """Build and return the Q&A agent."""
     schema_md = get_agents_md(Path(wiki_root))
     instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md)
     instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language."
 
     @function_tool
     def list_files(directory: str) -> str:
         """List all Markdown files in a wiki subdirectory.
-
         Args:
             directory: Subdirectory path relative to wiki root (e.g. 'sources').
         """
@@ -172,31 +46,29 @@ def list_files(directory: str) -> str:
     @function_tool
     def read_file(path: str) -> str:
         """Read a Markdown file from the wiki.
-
         Args:
             path: File path relative to wiki root (e.g. 'summaries/paper.md').
         """
         return read_wiki_file(path, wiki_root)
 
     @function_tool
-    def pageindex_retrieve(doc_id: str, question: str) -> str:
-        """Retrieve relevant content from a long document via PageIndex.
-
-        Use this when you need detailed content from a document that was
-        indexed with PageIndex (long documents).
-
+    def get_page_content_tool(doc_name: str, pages: str) -> str:
+        """Get text content of specific pages from a long document.
+        Use this when you need detailed content from a document. The summary
+        page shows chapter structure with page ranges.
         Args:
-            doc_id: PageIndex document identifier (found in index.md).
-            question: The question you are trying to answer.
+            doc_name: Document name (e.g. 'attention-is-all-you-need').
+            pages: Page specification (e.g. '3-5,7,10-12').
         """
-        return _pageindex_retrieve_impl(doc_id, question, openkb_dir, model)
+        from openkb.agent.tools import get_page_content
+        return get_page_content(doc_name, pages, wiki_root)
 
     from agents.model_settings import ModelSettings
 
     return Agent(
         name="wiki-query",
         instructions=instructions,
-        tools=[list_files, read_file, pageindex_retrieve],
+        tools=[list_files, read_file, get_page_content_tool],
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
@@ -224,9 +96,8 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
     language: str = config.get("language", "en")
 
     wiki_root = str(kb_dir / "wiki")
-    openkb_path = str(openkb_dir)
 
-    agent = build_query_agent(wiki_root, openkb_path, model, language=language)
+    agent = build_query_agent(wiki_root, model, language=language)
 
     if not stream:
         result = await Runner.run(agent, question)
diff --git a/openkb/schema.py b/openkb/schema.py
@@ -6,7 +6,7 @@
 # Wiki Schema
 
 ## Directory Structure
-- sources/ — Full-text converted from raw documents. Do not modify directly.
+- sources/ — Document content. Short docs as .md, long docs as .json (per-page). Do not modify directly.
 - sources/images/ — Extracted images from documents, referenced by sources.
 - summaries/ — One per source document. Summary of key content.
 - concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents.
diff --git a/tests/test_query.py b/tests/test_query.py
@@ -6,119 +6,41 @@
 
 import pytest
 
-from openkb.agent.query import _pageindex_retrieve_impl, build_query_agent, run_query
+from openkb.agent.query import build_query_agent, run_query
 from openkb.schema import SCHEMA_MD
 
 
 class TestBuildQueryAgent:
     def test_agent_name(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert agent.name == "wiki-query"
 
     def test_agent_has_three_tools(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert len(agent.tools) == 3
 
     def test_agent_tool_names(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         names = {t.name for t in agent.tools}
         assert "list_files" in names
         assert "read_file" in names
-        assert "pageindex_retrieve" in names
+        assert "get_page_content_tool" in names
+        assert "pageindex_retrieve" not in names
 
-    def test_instructions_reference_registered_pageindex_tool(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
-        tool_names = {t.name for t in agent.tools}
-        assert "pageindex_retrieve" in agent.instructions
-        assert "pageindex_retrieve" in tool_names
+    def test_instructions_mention_get_page_content(self, tmp_path):
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
+        assert "get_page_content" in agent.instructions
+        assert "pageindex_retrieve" not in agent.instructions
 
     def test_schema_in_instructions(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini")
+        agent = build_query_agent(str(tmp_path), "gpt-4o-mini")
         assert SCHEMA_MD in agent.instructions
 
     def test_agent_model(self, tmp_path):
-        agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "my-model")
+        agent = build_query_agent(str(tmp_path), "my-model")
         assert agent.model == "litellm/my-model"
 
 
-class TestPageindexRetrieve:
-    def test_returns_page_content(self, tmp_path):
-        mock_structure = [
-            {
-                "node_id": "n1",
-                "title": "Introduction",
-                "start_index": 1,
-                "end_index": 5,
-                "summary": "Overview section",
-            }
-        ]
-        mock_pages = [
-            {"page_index": 1, "text": "Introduction text here."},
-            {"page_index": 2, "text": "More intro content."},
-        ]
-
-        mock_col = MagicMock()
-        mock_col.get_document_structure.return_value = mock_structure
-        mock_col.get_page_content.return_value = mock_pages
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
-             patch("openkb.agent.query.litellm.completion") as mock_llm, \
-             patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
-            mock_llm.return_value = MagicMock(
-                choices=[MagicMock(message=MagicMock(content="1-2"))]
-            )
-            result = _pageindex_retrieve_impl("doc123", "What is the intro?", "/db", "gpt-4o-mini")
-
-        assert "Introduction text here." in result
-        assert "More intro content." in result
-
-    def test_cloud_doc_uses_streaming_query(self, tmp_path):
-        """Cloud doc (pi- prefix) delegates to col.query(stream=True)."""
-        from dataclasses import dataclass
-        from typing import Any
-
-        @dataclass
-        class FakeEvent:
-            type: str
-            data: Any
-
-        class FakeStream:
-            async def __aiter__(self):
-                yield FakeEvent(type="answer_delta", data="Cloud ")
-                yield FakeEvent(type="answer_delta", data="answer about MCP.")
-
-        mock_stream = FakeStream()
-
-        mock_col = MagicMock()
-        mock_col.query.return_value = mock_stream
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client):
-            result = _pageindex_retrieve_impl("pi-abc123", "What is MCP?", "/db", "gpt-4o-mini")
-
-        assert "Cloud answer about MCP." in result
-        mock_col.query.assert_called_once_with("What is MCP?", doc_ids=["pi-abc123"], stream=True)
-
-    def test_local_empty_structure_returns_error(self, tmp_path):
-        """Local doc with empty structure returns error."""
-        mock_col = MagicMock()
-        mock_col.get_document_structure.return_value = []
-
-        mock_client = MagicMock()
-        mock_client.collection.return_value = mock_col
-
-        with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \
-             patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False):
-            result = _pageindex_retrieve_impl("local-uuid-123", "What?", "/db", "gpt-4o-mini")
-
-        assert "No structure found" in result
-
-
 class TestRunQuery:
     @pytest.mark.asyncio
     async def test_run_query_returns_final_output(self, tmp_path):