Skip to content

Commit 27a9e3a

Browse files
committed
fix: change default model to gpt-5.4-mini, fix page_count fallback in indexer
- Default model changed from gpt-5.4 to gpt-5.4-mini
- Indexer get_page_content no longer uses a hardcoded 9999 fallback
- Infers page_count from the structure's end_index when the doc lacks a page_count field
- Added debug logging for doc keys and page_count diagnosis
1 parent 36ae619 commit 27a9e3a

2 files changed

Lines changed: 47 additions & 3 deletions

File tree

openkb/config.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@
66
import yaml
77

88
DEFAULT_CONFIG: dict[str, Any] = {
9-
"model": "gpt-5.4",
9+
"model": "gpt-5.4-mini",
1010
"language": "en",
1111
"pageindex_threshold": 20,
1212
}
1313

14+
GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"
15+
GLOBAL_CONFIG_PATH = GLOBAL_CONFIG_DIR / "global.yaml"
16+
1417

1518
def load_config(config_path: Path) -> dict[str, Any]:
1619
"""Load YAML config from config_path, merged with DEFAULT_CONFIG.
@@ -30,3 +33,30 @@ def save_config(config_path: Path, config: dict) -> None:
3033
config_path.parent.mkdir(parents=True, exist_ok=True)
3134
with config_path.open("w", encoding="utf-8") as fh:
3235
yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True)
36+
37+
38+
def load_global_config() -> dict[str, Any]:
    """Load the global config from ~/.config/openkb/global.yaml.

    Returns:
        The parsed top-level mapping, or an empty dict when the file does
        not exist or is empty.

    Raises:
        ValueError: If the file parses to something other than a mapping
            (for example a bare list or scalar).
    """
    # Open directly instead of checking exists() first: the file could vanish
    # between the check and the open.
    try:
        with GLOBAL_CONFIG_PATH.open("r", encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)
    except FileNotFoundError:
        return {}

    # An empty document parses to None; treat any falsy result as "no config".
    if not loaded:
        return {}

    # Callers use dict methods on the result, so reject other YAML shapes here
    # with a clear message rather than failing later with an AttributeError.
    if not isinstance(loaded, dict):
        raise ValueError(
            f"Global config {GLOBAL_CONFIG_PATH} must contain a YAML mapping, "
            f"got {type(loaded).__name__}"
        )
    return loaded
44+
45+
46+
def save_global_config(config: dict[str, Any]) -> None:
    """Save the global config to ~/.config/openkb/global.yaml.

    Args:
        config: Mapping to persist; it must be serializable by
            ``yaml.safe_dump``.

    The config is serialized to a string before the file is opened for
    writing. Opening with "w" truncates the file, so dumping straight into
    the handle would wipe the existing config if serialization failed.
    """
    serialized = yaml.safe_dump(config, allow_unicode=True, sort_keys=True)
    GLOBAL_CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    GLOBAL_CONFIG_PATH.write_text(serialized, encoding="utf-8")
51+
52+
53+
def register_kb(kb_path: Path) -> None:
    """Register a KB path in the global config's known_kbs list.

    The resolved path is appended to ``known_kbs`` if it is not already
    listed, and it is always recorded as ``default_kb``. The updated global
    config is then saved.

    Args:
        kb_path: Path to the knowledge base; it is resolved before being
            stored.
    """
    gc = load_global_config()
    resolved = str(kb_path.resolve())

    # Normalize known_kbs: a key present with no value (`known_kbs:`) loads
    # as None, and a hand-edited scalar string would otherwise be
    # substring-matched by `in` and then fail on append().
    raw_known = gc.get("known_kbs")
    if raw_known is None:
        known = []
    elif isinstance(raw_known, str):
        known = [raw_known]
    else:
        known = list(raw_known)

    if resolved not in known:
        known.append(resolved)

    gc["known_kbs"] = known
    gc["default_kb"] = resolved
    save_global_config(gc)

openkb/indexer.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
6767
description: str = doc.get("doc_description", "")
6868
structure: list = doc.get("structure", [])
6969

70+
# Debug: print doc keys and page_count to diagnose get_page_content range
71+
logger.info("Doc keys: %s", list(doc.keys()))
72+
logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT"))
73+
7074
tree = {
7175
"doc_name": doc_name,
7276
"doc_description": description,
@@ -78,8 +82,18 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
7882
sources_dir.mkdir(parents=True, exist_ok=True)
7983
dest_images_dir = sources_dir / "images" / pdf_path.stem
8084

81-
# Get per-page content from PageIndex
82-
all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}")
85+
# Get per-page content from PageIndex — use actual page count
86+
page_count = doc.get("page_count")
87+
if page_count is None:
88+
# Fallback: count pages from structure's max end_index
89+
max_page = 0
90+
for node in structure:
91+
end = node.get("end_index", 0)
92+
if end > max_page:
93+
max_page = end
94+
page_count = max_page if max_page > 0 else 100
95+
logger.info("page_count not in doc, inferred from structure: %d", page_count)
96+
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
8397

8498
# Relocate image paths in each page
8599
dest_images_dir.mkdir(parents=True, exist_ok=True)

0 commit comments

Comments
 (0)