Skip to content

Commit 2e1caf9

Browse files
authored
Merge pull request #14 from VectifyAI/dev
Cloud OCR indexing, pageindex dev1 bump, warning cleanup
2 parents 0291ec9 + b77e95d commit 2e1caf9

3 files changed

Lines changed: 48 additions & 16 deletions

File tree

openkb/cli.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
"""OpenKB CLI — command-line interface for the knowledge base workflow."""
22
from __future__ import annotations
33

4+
# Silence import-time warnings (e.g. pydub's missing-ffmpeg warning emitted
5+
# when markitdown pulls it in). markitdown later clobbers the filters during
6+
# its own import, so we re-apply after all imports below.
7+
import warnings
8+
warnings.filterwarnings("ignore")
9+
410
import asyncio
511
import json
612
import logging
@@ -256,22 +262,23 @@ def init():
256262
return
257263

258264
# Interactive prompts
265+
click.echo("Pick an LLM in `provider/model` LiteLLM format:")
266+
click.echo(" OpenAI: gpt-5.4-mini, gpt-5.4")
267+
click.echo(" Anthropic: anthropic/claude-sonnet-4-6, anthropic/claude-opus-4-6")
268+
click.echo(" Gemini: gemini/gemini-3.1-pro-preview, gemini/gemini-3-flash-preview")
269+
click.echo(" Others: see https://docs.litellm.ai/docs/providers")
270+
click.echo()
259271
model = click.prompt(
260-
f"Model (e.g. gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
272+
f"Model (enter for default {DEFAULT_CONFIG['model']})",
261273
default=DEFAULT_CONFIG["model"],
262274
show_default=False,
263275
)
264-
language = click.prompt(
265-
f"Language [default: {DEFAULT_CONFIG['language']}]",
266-
default=DEFAULT_CONFIG["language"],
267-
show_default=False,
268-
)
269-
pageindex_threshold = click.prompt(
270-
f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]",
271-
default=DEFAULT_CONFIG["pageindex_threshold"],
272-
type=int,
276+
api_key = click.prompt(
277+
"LLM API Key (saved to .env, enter to skip)",
278+
default="",
279+
hide_input=True,
273280
show_default=False,
274-
)
281+
).strip()
275282
# Create directory structure
276283
Path("raw").mkdir(exist_ok=True)
277284
Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
@@ -290,12 +297,22 @@ def init():
290297
openkb_dir.mkdir()
291298
config = {
292299
"model": model,
293-
"language": language,
294-
"pageindex_threshold": pageindex_threshold,
300+
"language": DEFAULT_CONFIG["language"],
301+
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
295302
}
296303
save_config(openkb_dir / "config.yaml", config)
297304
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
298305

306+
# Write the API key to a KB-local .env (mode 0600: owner read/write only) if the user provided one
307+
if api_key:
308+
env_path = Path(".env")
309+
if env_path.exists():
310+
click.echo(".env already exists, skipping write. Add LLM_API_KEY manually if needed.")
311+
else:
312+
env_path.write_text(f"LLM_API_KEY={api_key}\n", encoding="utf-8")
313+
os.chmod(env_path, 0o600)
314+
click.echo("Saved LLM API key to .env.")
315+
299316
# Register this KB in the global config
300317
register_kb(Path.cwd())
301318

openkb/indexer.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
7777
"structure": structure,
7878
}
7979

80-
# Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
80+
# Write wiki/sources/ — per-page content
8181
sources_dir = kb_dir / "wiki" / "sources"
8282
sources_dir.mkdir(parents=True, exist_ok=True)
8383
images_dir = sources_dir / "images" / pdf_path.stem
8484

8585
from openkb.images import convert_pdf_to_pages
86-
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
86+
87+
all_pages: list = []
88+
if pageindex_api_key:
89+
# Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
90+
# requires a page range, so pass "1-N".
91+
from openkb.converter import get_pdf_page_count
92+
page_count = get_pdf_page_count(pdf_path)
93+
try:
94+
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
95+
except Exception as exc:
96+
logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)
97+
98+
if not all_pages:
99+
if pageindex_api_key:
100+
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
101+
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
87102

88103
(sources_dir / f"{pdf_path.stem}.json").write_text(
89104
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ classifiers = [
2626
]
2727
keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
2828
dependencies = [
29-
"pageindex==0.3.0.dev0",
29+
"pageindex==0.3.0.dev1",
3030
"markitdown[all]",
3131
"click>=8.0",
3232
"watchdog>=3.0",

0 commit comments

Comments
 (0)