Skip to content

Commit ad05577

Browse files
committed
refactor: use pymupdf for page content extraction, unify image paths
Replace PageIndex get_page_content with pymupdf-based convert_pdf_to_pages for long doc JSON generation. All image paths now use sources/images/ prefix relative to wiki root. Removes dependency on PageIndex for source content.
1 parent 5a1f014 commit ad05577

2 files changed

Lines changed: 67 additions & 39 deletions

File tree

openkb/images.py

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,66 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[
6767
logger.warning("Failed to save image block on page %d", page_num)
6868
continue
6969

70-
rel_path = f"images/{doc_name}/{filename}"
70+
rel_path = f"sources/images/{doc_name}/{filename}"
7171
page_images.setdefault(page_num, []).append(rel_path)
7272
return page_images
7373

7474

75+
def convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict]:
    """Convert a PDF to per-page dicts with text content and images.

    Each dict has ``{"page": int, "content": str, "images": [{"path": str}]}``.
    Images are saved to *images_dir* and referenced with wiki-root-relative paths.

    Args:
        pdf_path: PDF file to read.
        doc_name: Name used as the ``sources/images/{doc_name}/`` component of
            every image reference written into the result.
        images_dir: Directory that receives the extracted PNG files; it is
            created (including parents) if it does not exist.

    Returns:
        One dict per page in document order, with 1-based ``page`` numbers.
        ``content`` is the page's text blocks joined by newlines, with a
        markdown image reference inserted at the position of each saved image.

    Image blocks narrower or shorter than ``_MIN_IMAGE_DIM``, or without image
    bytes, are skipped. A failure while decoding or writing one image is
    logged and does not abort the conversion.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    pages: list[dict] = []
    # Numbered across the whole document (not per page) so filenames stay unique.
    img_counter = 0

    with pymupdf.open(str(pdf_path)) as doc:
        for page_num, page in enumerate(doc, start=1):
            parts: list[str] = []
            page_images: list[dict] = []

            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # text block
                    # Spans of one line are concatenated; lines are newline-separated.
                    parts.append(
                        "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        )
                    )

                elif block["type"] == 1:  # image block
                    width = block.get("width", 0)
                    height = block.get("height", 0)
                    if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM:
                        continue
                    image_bytes = block.get("image")
                    if not image_bytes:
                        continue
                    try:
                        pix = pymupdf.Pixmap(image_bytes)
                        # PNG can only hold gray/RGB data. Convert anything with
                        # more than three colour components; subtracting alpha
                        # also catches CMYK without alpha (n == 4), which a plain
                        # ``n > 4`` test lets through and then fails to encode.
                        if pix.n - pix.alpha > 3:
                            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                        img_counter += 1
                        filename = f"p{page_num}_img{img_counter}.png"
                        (images_dir / filename).write_bytes(pix.tobytes("png"))
                        pix = None  # drop the pixmap reference right away
                        img_path = f"sources/images/{doc_name}/{filename}"
                        parts.append(f"\n![image]({img_path})\n")
                        page_images.append({"path": img_path})
                    except Exception:
                        # Best effort: keep converting, but record why this image failed.
                        logger.warning(
                            "Failed to save image block on page %d",
                            page_num,
                            exc_info=True,
                        )

            pages.append({
                "page": page_num,
                "content": "\n".join(parts),
                "images": page_images,
            })
    return pages
128+
129+
75130
def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str:
76131
"""Convert a PDF to markdown with inline images using pymupdf dict-mode.
77132
@@ -115,7 +170,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) ->
115170
filename = f"p{page_num}_img{img_counter}.png"
116171
(images_dir / filename).write_bytes(pix.tobytes("png"))
117172
pix = None
118-
parts.append(f"\n![image](images/{doc_name}/{filename})\n")
173+
parts.append(f"\n![image](sources/images/{doc_name}/{filename})\n")
119174
except Exception:
120175
logger.warning("Failed to save image block on page %d", page_num)
121176
return "\n".join(parts)
@@ -126,7 +181,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
126181
127182
For each ``![alt](data:image/ext;base64,DATA)`` match:
128183
- Decode base64 bytes → save to ``images_dir/img_NNN.ext``
129-
- Replace the link with ``![alt](images/{doc_name}/img_NNN.ext)``
184+
- Replace the link with ``![alt](sources/images/{doc_name}/img_NNN.ext)``
130185
- On decode failure: log a warning and leave the original text unchanged.
131186
"""
132187
counter = 0
@@ -150,7 +205,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str
150205
images_dir.mkdir(parents=True, exist_ok=True)
151206
dest.write_bytes(image_bytes)
152207

153-
new_ref = f"![{alt}](images/{doc_name}/{filename})"
208+
new_ref = f"![{alt}](sources/images/{doc_name}/{filename})"
154209
result = result.replace(match.group(0), new_ref, 1)
155210

156211
return result
@@ -164,7 +219,7 @@ def copy_relative_images(
164219
For each ``![alt](relative/path)`` match (skipping http/https and data URIs):
165220
- Resolve path relative to ``source_dir``
166221
- Copy to ``images_dir/{filename}``
167-
- Replace link with ``![alt](images/{doc_name}/{filename})``
222+
- Replace link with ``![alt](sources/images/{doc_name}/{filename})``
168223
- Missing source file: log a warning and leave the original text unchanged.
169224
"""
170225
result = markdown
@@ -186,7 +241,7 @@ def copy_relative_images(
186241
images_dir.mkdir(parents=True, exist_ok=True)
187242
shutil.copy2(src, dest)
188243

189-
new_ref = f"![{alt}](images/{doc_name}/{filename})"
244+
new_ref = f"![{alt}](sources/images/{doc_name}/{filename})"
190245
result = result.replace(match.group(0), new_ref, 1)
191246

192247
return result

openkb/indexer.py

Lines changed: 6 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import json as json_mod
55
import logging
6-
import shutil
6+
77
from dataclasses import dataclass
88
from pathlib import Path
99

@@ -77,40 +77,13 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
7777
"structure": structure,
7878
}
7979

80-
# Write wiki/sources/ — get per-page content from PageIndex and store as JSON
80+
# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
8181
sources_dir = kb_dir / "wiki" / "sources"
8282
sources_dir.mkdir(parents=True, exist_ok=True)
83-
dest_images_dir = sources_dir / "images" / pdf_path.stem
84-
85-
# Get per-page content from PageIndex — use actual page count
86-
page_count = doc.get("page_count")
87-
if page_count is None:
88-
# Fallback: count pages from structure's max end_index
89-
max_page = 0
90-
for node in structure:
91-
end = node.get("end_index", 0)
92-
if end > max_page:
93-
max_page = end
94-
page_count = max_page if max_page > 0 else 100
95-
logger.info("page_count not in doc, inferred from structure: %d", page_count)
96-
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
97-
98-
# Relocate image paths in each page
99-
dest_images_dir.mkdir(parents=True, exist_ok=True)
100-
for page in all_pages:
101-
if "images" in page:
102-
for img in page["images"]:
103-
src_path = Path(img["path"])
104-
if src_path.exists():
105-
filename = src_path.name
106-
dest = dest_images_dir / filename
107-
if not dest.exists():
108-
shutil.copy2(src_path, dest)
109-
new_path = f"images/{pdf_path.stem}/{filename}"
110-
# Also fix image references in page content
111-
if "content" in page:
112-
page["content"] = page["content"].replace(str(src_path), new_path)
113-
img["path"] = new_path
83+
images_dir = sources_dir / "images" / pdf_path.stem
84+
85+
from openkb.images import convert_pdf_to_pages
86+
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
11487

11588
(sources_dir / f"{pdf_path.stem}.json").write_text(
11689
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

0 commit comments

Comments
 (0)