Skip to content

Commit cc6215a

Browse files
committed
feat: store long doc sources as per-page JSON, remove render_source_md
Replace markdown source generation with per-page JSON from PageIndex get_page_content; remove render_source_md, _render_nodes_source, _relocate_images, and _IMG_REF_RE. Image relocation is now done inline per page. Update tests to assert .json output and mock get_page_content.
1 parent 5b086a5 commit cc6215a

3 files changed

Lines changed: 41 additions & 71 deletions

File tree

openkb/indexer.py

Lines changed: 23 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""PageIndex indexer for long documents."""
22
from __future__ import annotations
33

4+
import json as json_mod
45
import logging
5-
import re
66
import shutil
77
from dataclasses import dataclass
88
from pathlib import Path
@@ -12,12 +12,10 @@
1212
from pageindex import IndexConfig, PageIndexClient
1313

1414
from openkb.config import load_config
15-
from openkb.tree_renderer import render_source_md, render_summary_md
15+
from openkb.tree_renderer import render_summary_md
1616

1717
logger = logging.getLogger(__name__)
1818

19-
_IMG_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
20-
2119

2220
@dataclass
2321
class IndexResult:
@@ -28,31 +26,6 @@ class IndexResult:
2826
tree: dict
2927

3028

31-
def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str:
32-
"""Copy images from PageIndex internal paths to wiki/sources/images/ and rewrite refs.
33-
34-
PageIndex stores images internally (e.g. .openkb/files/{collection}/{doc_id}/images/).
35-
We copy them to dest_images_dir and rewrite paths to be relative to the .md file
36-
(i.e. images/{doc_stem}/filename).
37-
"""
38-
dest_images_dir.mkdir(parents=True, exist_ok=True)
39-
40-
def _replace(match: re.Match) -> str:
41-
alt = match.group(1)
42-
src_path_str = match.group(2)
43-
src_path = Path(src_path_str)
44-
if not src_path.exists():
45-
logger.warning("Image not found: %s", src_path)
46-
return match.group(0)
47-
filename = src_path.name
48-
dest = dest_images_dir / filename
49-
if not dest.exists():
50-
shutil.copy2(src_path, dest)
51-
return f"![{alt}](images/{doc_stem}/{filename})"
52-
53-
return _IMG_REF_RE.sub(_replace, markdown)
54-
55-
5629
def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
5730
"""Index a long PDF document using PageIndex and write wiki pages."""
5831
openkb_dir = kb_dir / ".openkb"
@@ -100,14 +73,30 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
10073
"structure": structure,
10174
}
10275

103-
# Write wiki/sources/ — copy images from PageIndex internal location
104-
# and rewrite paths to be relative to the .md file (images/{stem}/filename)
76+
# Write wiki/sources/ — get per-page content from PageIndex and store as JSON
10577
sources_dir = kb_dir / "wiki" / "sources"
10678
sources_dir.mkdir(parents=True, exist_ok=True)
10779
dest_images_dir = sources_dir / "images" / pdf_path.stem
108-
source_md = render_source_md(tree, doc_name, doc_id)
109-
source_md = _relocate_images(source_md, pdf_path.stem, dest_images_dir)
110-
(sources_dir / f"{pdf_path.stem}.md").write_text(source_md, encoding="utf-8")
80+
81+
# Get per-page content from PageIndex
82+
all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
83+
84+
# Relocate image paths in each page
85+
dest_images_dir.mkdir(parents=True, exist_ok=True)
86+
for page in all_pages:
87+
if "images" in page:
88+
for img in page["images"]:
89+
src_path = Path(img["path"])
90+
if src_path.exists():
91+
filename = src_path.name
92+
dest = dest_images_dir / filename
93+
if not dest.exists():
94+
shutil.copy2(src_path, dest)
95+
img["path"] = f"images/{pdf_path.stem}/{filename}"
96+
97+
(sources_dir / f"{pdf_path.stem}.json").write_text(
98+
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
99+
)
111100

112101
# Write wiki/summaries/ (no images, just summaries)
113102
summaries_dir = kb_dir / "wiki" / "summaries"

openkb/tree_renderer.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,6 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str:
1313
)
1414

1515

16-
def _render_nodes_source(nodes: list[dict], depth: int) -> str:
17-
"""Recursively render nodes for the *source* view (text content)."""
18-
lines: list[str] = []
19-
heading_prefix = "#" * min(depth, 6)
20-
for node in nodes:
21-
title = node.get("title", "")
22-
start = node.get("start_index", "")
23-
end = node.get("end_index", "")
24-
text = node.get("text", "")
25-
children = node.get("nodes", [])
26-
27-
lines.append(f"{heading_prefix} {title} (pages {start}\u2013{end})\n")
28-
if text:
29-
lines.append(f"{text}\n")
30-
if children:
31-
lines.append(_render_nodes_source(children, depth + 1))
32-
33-
return "\n".join(lines)
34-
3516

3617
def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
3718
"""Recursively render nodes for the *summary* view (summaries only)."""
@@ -53,18 +34,6 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str:
5334
return "\n".join(lines)
5435

5536

56-
def render_source_md(tree: dict, source_name: str, doc_id: str) -> str:
57-
"""Render the full-text (source) Markdown page for a PageIndex tree.
58-
59-
The page begins with YAML frontmatter, then recursively renders
60-
every node as a heading with its ``(pages X–Y)`` range and full text.
61-
Heading level equals tree depth (h1 at root), capped at h6.
62-
"""
63-
frontmatter = _yaml_frontmatter(source_name, doc_id)
64-
structure = tree.get("structure", [])
65-
body = _render_nodes_source(structure, depth=1)
66-
return frontmatter + "\n" + body
67-
6837

6938
def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str:
7039
"""Render the summary Markdown page for a PageIndex tree.

tests/test_indexer.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict):
2323
"doc_type": "pdf",
2424
"structure": sample_tree["structure"],
2525
}
26+
27+
# get_page_content returns empty list by default (overridden per test as needed)
28+
col.get_page_content.return_value = []
2629
return col
2730

2831
def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
@@ -43,24 +46,33 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path):
4346
assert result.description == sample_tree["doc_description"]
4447
assert result.tree is not None
4548

46-
def test_source_page_written(self, kb_dir, sample_tree, tmp_path):
49+
def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path):
50+
"""Long doc source should be written as JSON, not markdown."""
51+
import json as json_mod
4752
doc_id = "abc-123"
4853
fake_col = self._make_fake_collection(doc_id, sample_tree)
4954

5055
fake_client = MagicMock()
5156
fake_client.collection.return_value = fake_col
57+
# Mock get_page_content to return page data
58+
fake_col.get_page_content.return_value = [
59+
{"page": 1, "content": "Page one text."},
60+
{"page": 2, "content": "Page two text."},
61+
]
5262

5363
pdf_path = tmp_path / "sample.pdf"
5464
pdf_path.write_bytes(b"%PDF-1.4 fake")
5565

5666
with patch("openkb.indexer.PageIndexClient", return_value=fake_client):
5767
index_long_document(pdf_path, kb_dir)
5868

59-
source_file = kb_dir / "wiki" / "sources" / "sample.md"
60-
assert source_file.exists()
61-
content = source_file.read_text(encoding="utf-8")
62-
assert "type: pageindex" in content
63-
assert "Introduction" in content
69+
json_file = kb_dir / "wiki" / "sources" / "sample.json"
70+
assert json_file.exists()
71+
assert not (kb_dir / "wiki" / "sources" / "sample.md").exists()
72+
data = json_mod.loads(json_file.read_text())
73+
assert len(data) == 2
74+
assert data[0]["page"] == 1
75+
assert data[0]["content"] == "Page one text."
6476

6577
def test_summary_page_written(self, kb_dir, sample_tree, tmp_path):
6678
doc_id = "abc-123"

0 commit comments

Comments
 (0)