33
44from pathlib import Path
55
6- import litellm
76from agents import Agent , Runner , function_tool
8- import os
9-
10- from pageindex import PageIndexClient
117
128from openkb .agent .tools import list_wiki_files , read_wiki_file
139from openkb .schema import SCHEMA_MD , get_agents_md
1814{schema_md}
1915
2016## Search strategy
21- 1. Start by reading index.md to understand what documents and concepts are available.
22- 2. Read relevant summary pages (summaries/) to get document overviews.
17+ 1. Read index.md to understand what documents and concepts are available.
18+ Each entry has a brief summary to help you judge relevance.
19+ 2. Read relevant summary pages (summaries/) for document overviews.
23203. Read concept pages (concepts/) for cross-document synthesis.
24- 4. For long documents indexed with PageIndex, call pageindex_retrieve with the
25- document ID and the user's question to get detailed page-level content.
21+ 4. For long documents, use get_page_content(doc_name, pages) to read
22+ specific pages when you need detailed content. The summary page
23+ shows chapter structure with page ranges to help you decide which
24+ pages to read.
26255. Synthesise a clear, well-cited answer.
2726
2827Always ground your answer in the wiki content. If you cannot find relevant
2928information, say so clearly.
3029"""
3130
3231
33- def _pageindex_retrieve_impl (doc_id : str , question : str , openkb_dir : str , model : str ) -> str :
34- """Retrieve relevant content from a long document via PageIndex.
35-
36- For cloud-indexed docs: delegates to col.query() directly.
37- For local docs: uses structure-based page selection + get_page_content.
38- """
39- pageindex_api_key = os .environ .get ("PAGEINDEX_API_KEY" , "" )
40- # Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix)
41- is_cloud_doc = doc_id .startswith ("pi-" )
42-
43- if is_cloud_doc :
44- # Cloud doc: use PageIndex streaming query (avoids timeout, shows progress)
45- import sys
46- import asyncio
47- import threading
48-
49- client = PageIndexClient (api_key = pageindex_api_key or None , model = model )
50- col = client .collection ()
51- try :
52- stream = col .query (question , doc_ids = [doc_id ], stream = True )
53- collected : list [str ] = []
54- done = threading .Event ()
55-
56- async def _consume ():
57- try :
58- async for event in stream :
59- if event .type == "answer_delta" :
60- sys .stdout .write (event .data )
61- sys .stdout .flush ()
62- collected .append (event .data )
63- elif event .type == "tool_call" :
64- name = event .data .get ("name" , "" )
65- args = event .data .get ("args" , "" )
66- sys .stdout .write (f"\n [PageIndex] { name } ({ args } )\n " )
67- sys .stdout .flush ()
68- sys .stdout .write ("\n " )
69- sys .stdout .flush ()
70- finally :
71- done .set ()
72-
73- # Run streaming in a separate thread with its own event loop
74- def _run ():
75- loop = asyncio .new_event_loop ()
76- loop .run_until_complete (_consume ())
77- loop .close ()
78-
79- t = threading .Thread (target = _run , daemon = True )
80- t .start ()
81- t .join (timeout = 120 )
82- return "" .join (collected ) if collected else "No answer from PageIndex."
83- except Exception as exc :
84- return f"Error querying cloud PageIndex: { exc } "
85-
86- # Local doc: use local PageIndex with structure-based retrieval
87- client = PageIndexClient (model = model , storage_path = openkb_dir )
88- col = client .collection ()
89-
90- try :
91- structure = col .get_document_structure (doc_id )
92- except Exception as exc :
93- return f"Error retrieving document structure: { exc } "
94-
95- if not structure :
96- return "No structure found for document."
97- sections = []
98- for idx , node in enumerate (structure ):
99- title = node .get ("title" , f"Section { idx + 1 } " )
100- node_id = node .get ("node_id" , str (idx ))
101- summary = node .get ("summary" , "" )
102- start = node .get ("start_index" , idx )
103- end = node .get ("end_index" , idx )
104- sections .append (
105- f"node_id={ node_id } title='{ title } ' pages={ start } -{ end } summary='{ summary } '"
106- )
107-
108- sections_text = "\n " .join (sections )
109- prompt = (
110- f"Given the following document sections:\n { sections_text } \n \n "
111- f"Which page ranges are most relevant to this question: '{ question } '?\n "
112- "Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). "
113- "Return ONLY the page specification, nothing else."
114- )
115-
116- # 2. Ask LLM which pages are relevant
117- try :
118- response = litellm .completion (
119- model = model ,
120- messages = [{"role" : "user" , "content" : prompt }],
121- )
122- page_spec = response .choices [0 ].message .content .strip ()
123- except Exception as exc :
124- return f"Error selecting relevant pages: { exc } "
125-
126- if not page_spec :
127- return "Could not determine relevant pages."
128-
129- # 3. Fetch those pages
130- try :
131- pages = col .get_page_content (doc_id , page_spec )
132- except Exception as exc :
133- return f"Error fetching page content: { exc } "
134-
135- if not pages :
136- return f"No content found for pages: { page_spec } "
137-
138- parts = []
139- for item in pages :
140- page_num = item .get ("page_index" , "?" )
141- text = item .get ("text" , "" )
142- parts .append (f"[Page { page_num } ]\n { text } " )
143-
144- return "\n \n " .join (parts )
145-
146-
147- def build_query_agent (wiki_root : str , openkb_dir : str , model : str , language : str = "en" ) -> Agent :
148- """Build and return the Q&A agent.
149-
150- Args:
151- wiki_root: Absolute path to the wiki directory.
152- openkb_dir: Path to the .openkb/ state directory.
153- model: LLM model name.
154- language: Language code for wiki content (e.g. 'en', 'fr').
155-
156- Returns:
157- Configured :class:`~agents.Agent` instance.
158- """
32+ def build_query_agent (wiki_root : str , model : str , language : str = "en" ) -> Agent :
33+ """Build and return the Q&A agent."""
15934 schema_md = get_agents_md (Path (wiki_root ))
16035 instructions = _QUERY_INSTRUCTIONS_TEMPLATE .format (schema_md = schema_md )
16136 instructions += f"\n \n IMPORTANT: Write all wiki content in { language } language."
16237
16338 @function_tool
16439 def list_files (directory : str ) -> str :
16540 """List all Markdown files in a wiki subdirectory.
166-
16741 Args:
16842 directory: Subdirectory path relative to wiki root (e.g. 'sources').
16943 """
@@ -172,31 +46,29 @@ def list_files(directory: str) -> str:
17246 @function_tool
17347 def read_file (path : str ) -> str :
17448 """Read a Markdown file from the wiki.
175-
17649 Args:
17750 path: File path relative to wiki root (e.g. 'summaries/paper.md').
17851 """
17952 return read_wiki_file (path , wiki_root )
18053
18154 @function_tool
182- def pageindex_retrieve (doc_id : str , question : str ) -> str :
183- """Retrieve relevant content from a long document via PageIndex.
184-
185- Use this when you need detailed content from a document that was
186- indexed with PageIndex (long documents).
187-
55+ def get_page_content_tool (doc_name : str , pages : str ) -> str :
56+ """Get text content of specific pages from a long document.
57+ Use this when you need detailed content from a document. The summary
58+ page shows chapter structure with page ranges.
18859 Args:
189- doc_id: PageIndex document identifier (found in index.md ).
190- question: The question you are trying to answer .
60+ doc_name: Document name (e.g. 'attention-is-all-you-need' ).
61+ pages: Page specification (e.g. '3-5,7,10-12') .
19162 """
192- return _pageindex_retrieve_impl (doc_id , question , openkb_dir , model )
63+ from openkb .agent .tools import get_page_content
64+ return get_page_content (doc_name , pages , wiki_root )
19365
19466 from agents .model_settings import ModelSettings
19567
19668 return Agent (
19769 name = "wiki-query" ,
19870 instructions = instructions ,
199- tools = [list_files , read_file , pageindex_retrieve ],
71+ tools = [list_files , read_file , get_page_content_tool ],
20072 model = f"litellm/{ model } " ,
20173 model_settings = ModelSettings (parallel_tool_calls = False ),
20274 )
@@ -224,9 +96,8 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals
22496 language : str = config .get ("language" , "en" )
22597
22698 wiki_root = str (kb_dir / "wiki" )
227- openkb_path = str (openkb_dir )
22899
229- agent = build_query_agent (wiki_root , openkb_path , model , language = language )
100+ agent = build_query_agent (wiki_root , model , language = language )
230101
231102 if not stream :
232103 result = await Runner .run (agent , question )
0 commit comments