Skip to content

Commit cc6215a

Browse files
committed
feat: store long doc sources as per-page JSON, remove render_source_md
Replace markdown source generation with per-page JSON from PageIndex get_page_content; remove render_source_md, _render_nodes_source, _relocate_images, and _IMG_REF_RE. Image relocation is now done inline per page. Update tests to assert .json output and mock get_page_content.
1 parent 5b086a5 commit cc6215a

3 files changed

Lines changed: 41 additions & 71 deletions

File tree

openkb/indexer.py

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""PageIndex indexer for long documents."""
22
from __future__ import annotations
33

4+
import json as json_mod
45
import logging
5-
import re
66
import shutil
77
from dataclasses import dataclass
88
from pathlib import Path
@@ -12,12 +12,10 @@
1212
from pageindex import IndexConfig, PageIndexClient
1313

1414
from openkb.config import load_config
15-
from openkb.tree_renderer import render_source_md, render_summary_md
15+
from openkb.tree_renderer import render_summary_md
1616

1717
logger = logging.getLogger(__name__)
1818

19-
_IMG_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
20-
2119

2220
@dataclass
2321
class IndexResult:
@@ -28,31 +26,6 @@ class IndexResult:
2826
tree: dict
2927

3028

31-
def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str:
32-
"""Copy images from PageIndex internal paths to wiki/sources/images/ and rewrite refs.
33-
34-
PageIndex stores images internally (e.g. .openkb/files/{collection}/{doc_id}/images/).
35-
We copy them to dest_images_dir and rewrite paths to be relative to the .md file
36-
(i.e. images/{doc_stem}/filename).
37-
"""
38-
dest_images_dir.mkdir(parents=True, exist_ok=True)
39-
40-
def _replace(match: re.Match) -> str:
41-
alt = match.group(1)
42-
src_path_str = match.group(2)
43-
src_path = Path(src_path_str)
44-
if not src_path.exists():
45-
logger.warning("Image not found: %s", src_path)
46-
return match.group(0)
47-
filename = src_path.name
48-
dest = dest_images_dir / filename
49-
if not dest.exists():
50-
shutil.copy2(src_path, dest)
51-
return f"![{alt}](images/{doc_stem}/{filename})"
52-
53-
return _IMG_REF_RE.sub(_replace, markdown)
54-
55-
5629
def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
5730
"""Index a long PDF document using PageIndex and write wiki pages."""
5831
openkb_dir = kb_dir / ".openkb"
@@ -100,14 +73,30 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
10073
"structure": structure,
10174
}
10275

103-
# Write wiki/sources/ — copy images from PageIndex internal location
104-
# and rewrite paths to be relative to the .md file (images/{stem}/filename)
76+
# Write wiki/sources/ — get per-page content from PageIndex and store as JSON
10577
sources_dir = kb_dir / "wiki" / "sources"
10678
sources_dir.mkdir(parents=True, exist_ok=True)
10779
dest_images_dir = sources_dir / "images" / pdf_path.stem
108-
source_md = render_source_md(tree, doc_name, doc_id)
109-
source_md = _relocate_images(source_md, pdf_path.stem, dest_images_dir)
110-
(sources_dir / f"{pdf_path.stem}.md").write_text(source_md, encoding="utf-8")
80+
81+
# Get per-page content from PageIndex
82+
all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
83+
84+
# Relocate image paths in each page
85+
dest_images_dir.mkdir(parents=True, exist_ok=True)
86+
for page in all_pages:
87+
if "images" in page:
88+
for img in page["images"]:
89+
src_path = Path(img["path"])
90+
if src_path.exists():
91+
filename = src_path.name
92+
dest = dest_images_dir / filename
93+
if not dest.exists():
94+
shutil.copy2(src_path, dest)
95+
img["path"] = f"images/{pdf_path.stem}/{filename}"
96+
97+
(sources_dir / f"{pdf_path.stem}.json").write_text(
98+
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
99+
)
111100

112101
# Write wiki/summaries/ (no images, just summaries)
113102
summaries_dir = kb_dir / "wiki" / "summaries"

openkb/tree_renderer.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,6 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str:
1313
)
1414

1515

16-
def _render_nodes_source(nodes: list[dict], depth: int) -> str:
17-
"""Recursively render nodes for the *source* view (text content)."""
18-
lines: list[str] = []
19-
heading_prefix = "#" * min(depth, 6)
20-
for node in nodes:
21-
title = node.get("title", "")
22-
start = node.get("start_index", "")
23-
end = node.get("end_index", "")
24-
text = node.get("text", "")
25-
children = node.get("nodes", [])
26-
27-
lines.append(f"{heading_prefix} {title} (pages {start}\u2013{end})\n")
28-
if text:
29-
lines.append(f"{text}\n")
30-
if children:
31-
lines.append(_render_nodes_source(children, depth + 1))
32-
33-
return "\n".join(lines)
34-
3516

3617
def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
3718
"""Recursively render nodes for the *summary* view (summaries only)."""
@@ -53,18 +34,6 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
5334
return "\n".join(lines)
5435

5536

56-
def render_source_md(tree: dict, source_name: str, doc_id: str) -> str:
57-
"""Render the full-text (source) Markdown page for a PageIndex tree.
58-
59-
The page begins with YAML frontmatter, then recursively renders
60-
every node as a heading with its ``(pages X–Y)`` range and full text.
61-
Heading level equals tree depth (h1 at root), capped at h6.
62-
"""
63-
frontmatter = _yaml_frontmatter(source_name, doc_id)
64-
structure = tree.get("structure", [])
65-
body = _render_nodes_source(structure, depth=1)
66-
return frontmatter + "\n" + body
67-
6837

6938
def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str:
7039
"""Render the summary Markdown page for a PageIndex tree.

tests/test_indexer.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict):
2323
"doc_type": "pdf",
2424
"structure": sample_tree["structure"],
2525
}
26+
27+
# get_page_content returns empty list by default (overridden per test as needed)
28+
col.get_page_content.return_value = []
2629
return col
2730

2831
def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
@@ -43,24 +46,33 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
4346
assert result.description == sample_tree["doc_description"]
4447
assert result.tree is not None
4548

46-
def test_source_page_written(self, kb_dir, sample_tree, tmp_path):
49+
def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
50+
"""Long doc source should be written as JSON, not markdown."""
51+
import json as json_mod
4752
doc_id = "abc-123"
4853
fake_col = self._make_fake_collection(doc_id, sample_tree)
4954

5055
fake_client = MagicMock()
5156
fake_client.collection.return_value = fake_col
57+
# Mock get_page_content to return page data
58+
fake_col.get_page_content.return_value = [
59+
{"page": 1, "content": "Page one text."},
60+
{"page": 2, "content": "Page two text."},
61+
]
5262

5363
pdf_path = tmp_path / "sample.pdf"
5464
pdf_path.write_bytes(b"%PDF-1.4 fake")
5565

5666
with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
5767
index_long_document(pdf_path, kb_dir)
5868

59-
source_file = kb_dir / "wiki" / "sources" / "sample.md"
60-
assert source_file.exists()
61-
content = source_file.read_text(encoding="utf-8")
62-
assert "type: pageindex" in content
63-
assert "Introduction" in content
69+
json_file = kb_dir / "wiki" / "sources" / "sample.json"
70+
assert json_file.exists()
71+
assert not (kb_dir / "wiki" / "sources" / "sample.md").exists()
72+
data = json_mod.loads(json_file.read_text())
73+
assert len(data) == 2
74+
assert data[0]["page"] == 1
75+
assert data[0]["content"] == "Page one text."
6476

6577
def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
6678
doc_id = "abc-123"

0 commit comments

Comments
 (0)