Merge pull request #14 from VectifyAI/dev

rejojer · web-flow · commit 2e1caf90c8e8 · 2026-04-11T01:48:49.000+08:00
Add cloud OCR indexing, bump pageindex to 0.3.0.dev1, and silence import-time warnings
diff --git a/openkb/cli.py b/openkb/cli.py
@@ -1,6 +1,12 @@
 """OpenKB CLI — command-line interface for the knowledge base workflow."""
 from __future__ import annotations
 
+# Silence every warning raised at import time (e.g. pydub's missing-ffmpeg
+# warning, emitted when markitdown pulls it in). markitdown later clobbers the
+# filters during its own import, so they are re-applied after the imports below.
+import warnings
+warnings.filterwarnings("ignore")
+
 import asyncio
 import json
 import logging
@@ -256,22 +262,23 @@ def init():
         return
 
     # Interactive prompts
+    click.echo("Pick an LLM in `provider/model` LiteLLM format:")
+    click.echo("  OpenAI:    gpt-5.4-mini, gpt-5.4")
+    click.echo("  Anthropic: anthropic/claude-sonnet-4-6, anthropic/claude-opus-4-6")
+    click.echo("  Gemini:    gemini/gemini-3.1-pro-preview, gemini/gemini-3-flash-preview")
+    click.echo("  Others:    see https://docs.litellm.ai/docs/providers")
+    click.echo()
     model = click.prompt(
-        f"Model (e.g. gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]",
+        f"Model (enter for default {DEFAULT_CONFIG['model']})",
         default=DEFAULT_CONFIG["model"],
         show_default=False,
     )
-    language = click.prompt(
-        f"Language [default: {DEFAULT_CONFIG['language']}]",
-        default=DEFAULT_CONFIG["language"],
-        show_default=False,
-    )
-    pageindex_threshold = click.prompt(
-        f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]",
-        default=DEFAULT_CONFIG["pageindex_threshold"],
-        type=int,
+    api_key = click.prompt(
+        "LLM API Key (saved to .env, enter to skip)",
+        default="",
+        hide_input=True,
         show_default=False,
-    )
+    ).strip()
     # Create directory structure
     Path("raw").mkdir(exist_ok=True)
     Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
@@ -290,12 +297,22 @@ def init():
     openkb_dir.mkdir()
     config = {
         "model": model,
-        "language": language,
-        "pageindex_threshold": pageindex_threshold,
+        "language": DEFAULT_CONFIG["language"],
+        "pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
     }
     save_config(openkb_dir / "config.yaml", config)
     (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
 
+    # Write API key to KB-local .env (0600) if the user provided one
+    if api_key:
+        env_path = Path(".env")
+        if env_path.exists():
+            click.echo(".env already exists, skipping write. Add LLM_API_KEY manually if needed.")
+        else:
+            env_path.write_text(f"LLM_API_KEY={api_key}\n", encoding="utf-8")
+            os.chmod(env_path, 0o600)
+            click.echo("Saved LLM API key to .env.")
+
     # Register this KB in the global config
     register_kb(Path.cwd())
 
diff --git a/openkb/indexer.py b/openkb/indexer.py
@@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
         "structure": structure,
     }
 
-    # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex)
+    # Write wiki/sources/ — per-page content (PageIndex cloud OCR when an API key is set, else local pymupdf)
     sources_dir = kb_dir / "wiki" / "sources"
     sources_dir.mkdir(parents=True, exist_ok=True)
     images_dir = sources_dir / "images" / pdf_path.stem
 
     from openkb.images import convert_pdf_to_pages
-    all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
+
+    all_pages: list = []
+    if pageindex_api_key:
+        # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
+        # requires a page range, so pass "1-N".
+        from openkb.converter import get_pdf_page_count
+        page_count = get_pdf_page_count(pdf_path)
+        try:
+            all_pages = col.get_page_content(doc_id, f"1-{page_count}")
+        except Exception as exc:
+            logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)
+
+    if not all_pages:
+        if pageindex_api_key:
+            logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
+        all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
 
     (sources_dir / f"{pdf_path.stem}.json").write_text(
         json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 ]
 keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
 dependencies = [
-    "pageindex==0.3.0.dev0",
+    "pageindex==0.3.0.dev1",
     "markitdown[all]",
     "click>=8.0",
     "watchdog>=3.0",

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ classifiers = [`
`26`	`26`	`]`
`27`	`27`	`keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]`
`28`	`28`	`dependencies = [`
`29`		`- "pageindex==0.3.0.dev0",`
	`29`	`+ "pageindex==0.3.0.dev1",`
`30`	`30`	`"markitdown[all]",`
`31`	`31`	`"click>=8.0",`
`32`	`32`	`"watchdog>=3.0",`