Skip to content

Commit 8c5bc2f

Browse files
committed
Use cloud OCR for per-page content in cloud mode
When PAGEINDEX_API_KEY is set, index_long_document now fetches per-page markdown via col.get_page_content() instead of running local pymupdf. Cloud OCR produces cleaner output (preserves tables, math, and section headers) than raw pymupdf text extraction. It falls back to local pymupdf if the cloud call raises an exception or returns no pages.
1 parent 771452d commit 8c5bc2f

1 file changed

Lines changed: 17 additions & 2 deletions

File tree

openkb/indexer.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
7777
"structure": structure,
7878
}
7979

80-
# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
80+
# Write wiki/sources/ — per-page content
8181
sources_dir = kb_dir / "wiki" / "sources"
8282
sources_dir.mkdir(parents=True, exist_ok=True)
8383
images_dir = sources_dir / "images" / pdf_path.stem
8484

8585
from openkb.images import convert_pdf_to_pages
86-
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
86+
87+
all_pages: list = []
88+
if pageindex_api_key:
89+
# Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
90+
# requires a page range, so pass "1-N".
91+
from openkb.converter import get_pdf_page_count
92+
page_count = get_pdf_page_count(pdf_path)
93+
try:
94+
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
95+
except Exception as exc:
96+
logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)
97+
98+
if not all_pages:
99+
if pageindex_api_key:
100+
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
101+
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
87102

88103
(sources_dir / f"{pdf_path.stem}.json").write_text(
89104
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

0 commit comments

Comments
 (0)