11"""PageIndex indexer for long documents."""
22from __future__ import annotations
33
4+ import json as json_mod
45import logging
5- import re
66import shutil
77from dataclasses import dataclass
88from pathlib import Path
1212from pageindex import IndexConfig , PageIndexClient
1313
1414from openkb .config import load_config
15- from openkb .tree_renderer import render_source_md , render_summary_md
15+ from openkb .tree_renderer import render_summary_md
1616
1717logger = logging .getLogger (__name__ )
1818
19- _IMG_REF_RE = re .compile (r"!\[([^\]]*)\]\(([^)]+)\)" )
20-
2119
2220@dataclass
2321class IndexResult :
@@ -28,31 +26,6 @@ class IndexResult:
2826 tree : dict
2927
3028
def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str:
    """Copy referenced images into ``dest_images_dir`` and rewrite their refs.

    PageIndex stores images internally
    (e.g. .openkb/files/{collection}/{doc_id}/images/). Every Markdown image
    reference in ``markdown`` whose target exists on disk is copied into
    ``dest_images_dir`` and rewritten to ``images/{doc_stem}/{filename}``,
    i.e. a path relative to the generated .md file.

    Args:
        markdown: Markdown text containing ``![alt](path)`` image references.
        doc_stem: Document stem used as the sub-directory name in the
            rewritten reference.
        dest_images_dir: Directory the image files are copied into; created
            if it does not exist.

    Returns:
        The Markdown text with every resolvable image reference rewritten.
        References whose source file is missing are left unchanged and a
        warning is logged.
    """
    dest_images_dir.mkdir(parents=True, exist_ok=True)

    def _replace(match: re.Match) -> str:
        alt = match.group(1)
        src_path = Path(match.group(2))
        if not src_path.exists():
            # Keep the original reference so the problem stays visible in
            # the output instead of silently dropping the image.
            logger.warning("Image not found: %s", src_path)
            return match.group(0)
        filename = src_path.name
        dest = dest_images_dir / filename
        # An existing destination file is reused rather than overwritten.
        if not dest.exists():
            shutil.copy2(src_path, dest)
        # Emit the rewritten reference, preserving the alt text; returning an
        # empty string here would delete the image from the document.
        return f"![{alt}](images/{doc_stem}/{filename})"

    return _IMG_REF_RE.sub(_replace, markdown)
54-
55-
5629def index_long_document (pdf_path : Path , kb_dir : Path ) -> IndexResult :
5730 """Index a long PDF document using PageIndex and write wiki pages."""
5831 openkb_dir = kb_dir / ".openkb"
@@ -100,14 +73,30 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
10073 "structure" : structure ,
10174 }
10275
103- # Write wiki/sources/ — copy images from PageIndex internal location
104- # and rewrite paths to be relative to the .md file (images/{stem}/filename)
76+ # Write wiki/sources/ — get per-page content from PageIndex and store as JSON
10577 sources_dir = kb_dir / "wiki" / "sources"
10678 sources_dir .mkdir (parents = True , exist_ok = True )
10779 dest_images_dir = sources_dir / "images" / pdf_path .stem
108- source_md = render_source_md (tree , doc_name , doc_id )
109- source_md = _relocate_images (source_md , pdf_path .stem , dest_images_dir )
110- (sources_dir / f"{ pdf_path .stem } .md" ).write_text (source_md , encoding = "utf-8" )
80+
81+ # Get per-page content from PageIndex
82+ all_pages = col .get_page_content (doc_id , f"1-{ doc .get ('page_count' , 9999 )} " )
83+
84+ # Relocate image paths in each page
85+ dest_images_dir .mkdir (parents = True , exist_ok = True )
86+ for page in all_pages :
87+ if "images" in page :
88+ for img in page ["images" ]:
89+ src_path = Path (img ["path" ])
90+ if src_path .exists ():
91+ filename = src_path .name
92+ dest = dest_images_dir / filename
93+ if not dest .exists ():
94+ shutil .copy2 (src_path , dest )
95+ img ["path" ] = f"images/{ pdf_path .stem } /{ filename } "
96+
97+ (sources_dir / f"{ pdf_path .stem } .json" ).write_text (
98+ json_mod .dumps (all_pages , ensure_ascii = False , indent = 2 ), encoding = "utf-8" ,
99+ )
111100
112101 # Write wiki/summaries/ (no images, just summaries)
113102 summaries_dir = kb_dir / "wiki" / "summaries"
0 commit comments