Skip to content

Commit 39ae5c5

Browse files
committed
feat: add get_page_content tool and parse_pages helper
Adds parse_pages() to expand page specs like "1-3,7" into sorted deduplicated int lists, and get_page_content() to read per-page JSON (sources/{doc}.json) and format output with optional image paths. Includes path-traversal guard consistent with existing tools.
1 parent 072d9f5 commit 39ae5c5

2 files changed

Lines changed: 167 additions & 1 deletion

File tree

openkb/agent/tools.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"""
77
from __future__ import annotations
88

9+
import json as _json
910
from pathlib import Path
1011

1112

@@ -52,6 +53,86 @@ def read_wiki_file(path: str, wiki_root: str) -> str:
5253
return full_path.read_text(encoding="utf-8")
5354

5455

56+
def parse_pages(pages: str) -> list[int]:
    """Parse a page specification string into a sorted, deduplicated list of page numbers.

    Args:
        pages: Page spec such as ``"3-5,7,10-12"``.

    Returns:
        Sorted list of positive page numbers, e.g. ``[3, 4, 5, 7, 10, 11, 12]``.
        Malformed parts (e.g. ``"abc"``, ``"-1"``) and non-positive numbers are
        silently ignored; an empty or descending range (``"5-3"``) yields nothing.
    """
    result: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if not part:
            continue
        start_s, sep, end_s = part.partition("-")
        try:
            if sep:
                # Inclusive range like "3-5". A leading "-" (negative number)
                # leaves start_s empty, so int("") raises and the part is
                # skipped — negatives could never survive the positivity
                # filter below anyway.
                result.update(range(int(start_s), int(end_s) + 1))
            else:
                result.add(int(part))
        except ValueError:
            # Malformed part — ignore it rather than fail the whole spec.
            pass
    return sorted(n for n in result if n > 0)
90+
91+
92+
def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str:
    """Return formatted content for specified pages of a document.

    Reads ``{wiki_root}/sources/{doc_name}.json`` which must be a JSON array of
    objects with at least ``{"page": int, "content": str}`` fields and an
    optional ``"images"`` list of ``{"path": str, ...}`` objects.

    Args:
        doc_name: Document name without extension (e.g. ``"paper"``).
        pages: Page specification string (e.g. ``"1-3,7"``).
        wiki_root: Absolute path to the wiki root directory.

    Returns:
        Formatted page content, or an error message string.
    """
    base = Path(wiki_root).resolve()
    source_file = (base / "sources" / f"{doc_name}.json").resolve()

    # Traversal guard: doc_name like "../../etc/passwd" resolves outside base.
    if not source_file.is_relative_to(base):
        return "Access denied: path escapes wiki root."
    if not source_file.exists():
        return f"File not found: sources/{doc_name}.json"

    entries = _json.loads(source_file.read_text(encoding="utf-8"))
    wanted = set(parse_pages(pages))
    selected = [e for e in entries if e.get("page") in wanted]
    if not selected:
        return f"No content found for pages {pages} in {doc_name}."

    blocks: list[str] = []
    for entry in selected:
        text = f"[Page {entry['page']}]\n{entry.get('content', '')}"
        # Append an image-path summary line only when at least one path exists.
        image_paths = ", ".join(
            img["path"] for img in entry.get("images") or [] if "path" in img
        )
        if image_paths:
            text += f"\n[Images: {image_paths}]"
        blocks.append(text)
    return "\n\n".join(blocks) + "\n\n"
134+
135+
55136
def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
56137
"""Write or overwrite a Markdown file in the wiki.
57138

tests/test_agent_tools.py

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77

8-
from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file
8+
from openkb.agent.tools import get_page_content, list_wiki_files, parse_pages, read_wiki_file, write_wiki_file
99

1010

1111
# ---------------------------------------------------------------------------
@@ -128,3 +128,88 @@ def test_returns_written_path(self, tmp_path):
128128
result = write_wiki_file("reports/health.md", "All good.", wiki_root)
129129

130130
assert result == "Written: reports/health.md"
131+
132+
133+
# ---------------------------------------------------------------------------
134+
# parse_pages
135+
# ---------------------------------------------------------------------------
136+
137+
138+
class TestParsePages:
    """Behavioral tests for the parse_pages() page-spec parser."""

    def test_single_page(self):
        result = parse_pages("3")
        assert result == [3]

    def test_range(self):
        result = parse_pages("3-5")
        assert result == [3, 4, 5]

    def test_comma_separated(self):
        result = parse_pages("1,3,5")
        assert result == [1, 3, 5]

    def test_mixed(self):
        result = parse_pages("1-3,7,10-12")
        assert result == [1, 2, 3, 7, 10, 11, 12]

    def test_deduplication(self):
        result = parse_pages("3,3,3")
        assert result == [3]

    def test_sorted(self):
        result = parse_pages("5,1,3")
        assert result == [1, 3, 5]

    def test_ignores_zero_and_negative(self):
        result = parse_pages("0,-1,3")
        assert result == [3]
159+
160+
161+
# ---------------------------------------------------------------------------
162+
# get_page_content
163+
# ---------------------------------------------------------------------------
164+
165+
166+
class TestGetPageContent:
    """Behavioral tests for get_page_content()."""

    @staticmethod
    def _write_doc(tmp_path, name, entries):
        """Write sources/{name}.json under tmp_path and return the wiki root string."""
        import json

        sources_dir = tmp_path / "sources"
        sources_dir.mkdir()
        doc_path = sources_dir / f"{name}.json"
        doc_path.write_text(json.dumps(entries), encoding="utf-8")
        return str(tmp_path)

    def test_reads_pages_from_json(self, tmp_path):
        wiki_root = self._write_doc(
            tmp_path,
            "paper",
            [
                {"page": 1, "content": "Page one text."},
                {"page": 2, "content": "Page two text."},
                {"page": 3, "content": "Page three text."},
            ],
        )
        result = get_page_content("paper", "1,3", wiki_root)
        assert "[Page 1]" in result
        assert "Page one text." in result
        assert "[Page 3]" in result
        assert "Page three text." in result
        assert "Page two" not in result

    def test_returns_error_for_missing_file(self, tmp_path):
        (tmp_path / "sources").mkdir()
        result = get_page_content("nonexistent", "1", str(tmp_path))
        assert "not found" in result.lower()

    def test_returns_error_for_no_matching_pages(self, tmp_path):
        wiki_root = self._write_doc(
            tmp_path, "paper", [{"page": 1, "content": "Only page."}]
        )
        result = get_page_content("paper", "99", wiki_root)
        assert "no content" in result.lower()

    def test_includes_images_info(self, tmp_path):
        entries = [
            {
                "page": 1,
                "content": "Text.",
                "images": [
                    {"path": "images/p/img.png", "width": 100, "height": 80}
                ],
            }
        ]
        wiki_root = self._write_doc(tmp_path, "doc", entries)
        result = get_page_content("doc", "1", wiki_root)
        assert "img.png" in result

    def test_path_escape_denied(self, tmp_path):
        (tmp_path / "sources").mkdir()
        result = get_page_content("../../etc/passwd", "1", str(tmp_path))
        assert "denied" in result.lower() or "not found" in result.lower()

0 commit comments

Comments
 (0)