Skip to content

Commit e3bee82

Browse files
committed
feat: add SQLite-backed registry
1 parent 0b19f9c commit e3bee82

4 files changed

Lines changed: 186 additions & 21 deletions

File tree

openkb/cli.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -138,14 +138,15 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
138138
4. Else: compile_short_doc.
139139
"""
140140
from openkb.agent.compiler import compile_long_doc, compile_short_doc
141-
from openkb.state import HashRegistry
141+
from openkb.state import get_registry
142142

143143
logger = logging.getLogger(__name__)
144144
openkb_dir = kb_dir / ".openkb"
145145
config = load_config(openkb_dir / "config.yaml")
146146
_setup_llm_key(kb_dir)
147147
model: str = config.get("model", DEFAULT_CONFIG["model"])
148-
registry = HashRegistry(openkb_dir / "hashes.json")
148+
backend = config.get("storage_backend", "sqlite")
149+
registry = get_registry(openkb_dir, backend=backend)
149150

150151
# 2. Convert document
151152
click.echo(f"Adding: {file_path.name}")
@@ -299,9 +300,10 @@ def init():
299300
"model": model,
300301
"language": DEFAULT_CONFIG["language"],
301302
"pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"],
303+
"storage_backend": DEFAULT_CONFIG["storage_backend"],
302304
}
303305
save_config(openkb_dir / "config.yaml", config)
304-
(openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8")
306+
# The SQLite DB is created automatically by get_registry() on first access; no need to pre-create it.
305307

306308
# Write API key to KB-local .env (0600) if the user provided one
307309
if api_key:
@@ -590,13 +592,13 @@ def list_cmd(ctx):
590592
click.echo("No knowledge base found. Run `openkb init` first.")
591593
return
592594

593-
openkb_dir = kb_dir / ".openkb"
594-
hashes_file = openkb_dir / "hashes.json"
595-
if not hashes_file.exists():
596-
click.echo("No documents indexed yet.")
597-
return
595+
from openkb.state import get_registry
598596

599-
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
597+
openkb_dir = kb_dir / ".openkb"
598+
config = load_config(openkb_dir / "config.yaml")
599+
backend = config.get("storage_backend", "sqlite")
600+
registry = get_registry(openkb_dir, backend=backend)
601+
hashes = registry.all_entries()
600602
if not hashes:
601603
click.echo("No documents indexed yet.")
602604
return
@@ -673,11 +675,14 @@ def status(ctx):
673675
click.echo(f" {'raw':<20} {raw_count:<10}")
674676

675677
# Hash registry summary
678+
from openkb.state import get_registry
679+
676680
openkb_dir = kb_dir / ".openkb"
677-
hashes_file = openkb_dir / "hashes.json"
678-
if hashes_file.exists():
679-
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
680-
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
681+
config = load_config(openkb_dir / "config.yaml")
682+
backend = config.get("storage_backend", "sqlite")
683+
registry = get_registry(openkb_dir, backend=backend)
684+
hashes = registry.all_entries()
685+
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
681686

682687
# Last compile time: newest file in wiki/summaries/
683688
summaries_dir = wiki_dir / "summaries"

openkb/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"model": "gpt-5.4-mini",
1010
"language": "en",
1111
"pageindex_threshold": 20,
12+
"storage_backend": "sqlite",
1213
}
1314

1415
GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb"

openkb/converter.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from openkb.config import load_config
1313
from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images
14-
from openkb.state import HashRegistry
14+
from openkb.state import get_registry
1515

1616
logger = logging.getLogger(__name__)
1717

@@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult:
5050
openkb_dir = kb_dir / ".openkb"
5151
config = load_config(openkb_dir / "config.yaml")
5252
threshold: int = config.get("pageindex_threshold", 20)
53-
registry = HashRegistry(openkb_dir / "hashes.json")
53+
backend = config.get("storage_backend", "sqlite")
54+
registry = get_registry(openkb_dir, backend=backend)
5455

5556
# ------------------------------------------------------------------
5657
# 1. Hash check
5758
# ------------------------------------------------------------------
58-
file_hash = HashRegistry.hash_file(src)
59+
file_hash = registry.hash_file(src)
5960
if registry.is_known(file_hash):
6061
logger.info("Skipping already-known file: %s", src.name)
6162
return ConvertResult(skipped=True)

openkb/state.py

Lines changed: 163 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,19 @@
22

33
import hashlib
44
import json
5+
import sqlite3
6+
from contextlib import contextmanager
57
from pathlib import Path
8+
from typing import Iterator
9+
10+
11+
def _hash_file(path: Path) -> str:
12+
"""Return the SHA-256 hex digest (64 chars) of the file at path."""
13+
h = hashlib.sha256()
14+
with path.open("rb") as fh:
15+
for chunk in iter(lambda: fh.read(65536), b""):
16+
h.update(chunk)
17+
return h.hexdigest()
618

719

820
class HashRegistry:
@@ -57,8 +69,154 @@ def _persist(self) -> None:
5769
@staticmethod
5870
def hash_file(path: Path) -> str:
5971
"""Return the SHA-256 hex digest (64 chars) of the file at path."""
60-
h = hashlib.sha256()
61-
with path.open("rb") as fh:
62-
for chunk in iter(lambda: fh.read(65536), b""):
63-
h.update(chunk)
64-
return h.hexdigest()
72+
return _hash_file(path)
73+
74+
75+
class DbRegistry:
76+
"""SQLite-backed registry mapping file SHA-256 hashes to metadata dicts.
77+
78+
Provides better scalability, concurrency support, and extensibility
79+
compared to JSON-backed HashRegistry.
80+
"""
81+
82+
def __init__(self, path: Path, migrate_from: Path | None = None) -> None:
83+
"""Initialize DbRegistry.
84+
85+
Args:
86+
path: Path to SQLite database file.
87+
migrate_from: Optional path to JSON file to migrate from.
88+
Migration only happens if DB doesn't exist yet.
89+
"""
90+
self._path = path
91+
should_migrate = migrate_from is not None and not path.exists()
92+
self._init_db()
93+
if should_migrate:
94+
self._migrate_from_json(migrate_from)
95+
96+
def _migrate_from_json(self, json_path: Path) -> None:
97+
"""Migrate data from JSON file to SQLite database."""
98+
if not json_path.exists():
99+
return
100+
101+
with json_path.open("r", encoding="utf-8") as fh:
102+
data: dict[str, dict] = json.load(fh)
103+
104+
with self._connect() as conn:
105+
for file_hash, metadata in data.items():
106+
metadata_json = json.dumps(metadata, ensure_ascii=False)
107+
conn.execute("""
108+
INSERT OR REPLACE INTO registry (file_hash, metadata_json)
109+
VALUES (?, ?)
110+
""", (file_hash, metadata_json))
111+
112+
def _init_db(self) -> None:
113+
"""Initialize database schema if not exists."""
114+
self._path.parent.mkdir(parents=True, exist_ok=True)
115+
116+
with self._connect() as conn:
117+
conn.execute("PRAGMA journal_mode=WAL")
118+
conn.execute("PRAGMA foreign_keys=ON")
119+
conn.execute("""
120+
CREATE TABLE IF NOT EXISTS registry (
121+
file_hash TEXT PRIMARY KEY,
122+
metadata_json TEXT NOT NULL,
123+
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
124+
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
125+
)
126+
""")
127+
conn.execute("""
128+
CREATE INDEX IF NOT EXISTS idx_created_at ON registry(created_at)
129+
""")
130+
131+
@contextmanager
132+
def _connect(self) -> Iterator[sqlite3.Connection]:
133+
"""Context manager for database connections."""
134+
conn = sqlite3.connect(str(self._path))
135+
try:
136+
yield conn
137+
conn.commit()
138+
finally:
139+
conn.close()
140+
141+
def is_known(self, file_hash: str) -> bool:
142+
"""Return True if file_hash is already registered."""
143+
with self._connect() as conn:
144+
cursor = conn.execute(
145+
"SELECT 1 FROM registry WHERE file_hash = ?",
146+
(file_hash,)
147+
)
148+
return cursor.fetchone() is not None
149+
150+
def get(self, file_hash: str) -> dict | None:
151+
"""Return metadata for file_hash, or None if not found."""
152+
with self._connect() as conn:
153+
cursor = conn.execute(
154+
"SELECT metadata_json FROM registry WHERE file_hash = ?",
155+
(file_hash,)
156+
)
157+
row = cursor.fetchone()
158+
if row is None:
159+
return None
160+
return json.loads(row[0])
161+
162+
def all_entries(self) -> dict[str, dict]:
163+
"""Return a shallow copy of all hash -> metadata entries."""
164+
with self._connect() as conn:
165+
cursor = conn.execute(
166+
"SELECT file_hash, metadata_json FROM registry"
167+
)
168+
return {
169+
row[0]: json.loads(row[1])
170+
for row in cursor.fetchall()
171+
}
172+
173+
def add(self, file_hash: str, metadata: dict) -> None:
174+
"""Register file_hash with metadata and persist to disk.
175+
176+
If file_hash already exists, updates the metadata.
177+
"""
178+
metadata_json = json.dumps(metadata, ensure_ascii=False)
179+
with self._connect() as conn:
180+
conn.execute("""
181+
INSERT INTO registry (file_hash, metadata_json, updated_at)
182+
VALUES (?, ?, CURRENT_TIMESTAMP)
183+
ON CONFLICT(file_hash) DO UPDATE SET
184+
metadata_json = excluded.metadata_json,
185+
updated_at = CURRENT_TIMESTAMP
186+
""", (file_hash, metadata_json))
187+
188+
@staticmethod
189+
def hash_file(path: Path) -> str:
190+
"""Return the SHA-256 hex digest (64 chars) of the file at path."""
191+
return _hash_file(path)
192+
193+
194+
def get_registry(
    openkb_dir: Path,
    backend: str = "sqlite",
) -> HashRegistry | DbRegistry:
    """Build the registry implementation selected by ``backend``.

    Args:
        openkb_dir: Path to the .openkb directory.
        backend: Storage backend name, either "sqlite" or "json".

    Returns:
        A HashRegistry over ``hashes.json`` for "json", or a DbRegistry over
        ``hashes.db`` for "sqlite".

    Raises:
        ValueError: If ``backend`` is neither "sqlite" nor "json".

    For the sqlite backend, when ``hashes.json`` exists and ``hashes.db``
    does not, the JSON file is passed to DbRegistry as its migration source.
    """
    if backend == "json":
        return HashRegistry(openkb_dir / "hashes.json")
    if backend != "sqlite":
        raise ValueError(f"Unknown storage_backend: {backend!r}")

    legacy_json = openkb_dir / "hashes.json"
    database = openkb_dir / "hashes.db"

    # Only hand over the legacy file when there is no database yet.
    needs_migration = legacy_json.exists() and not database.exists()
    return DbRegistry(
        database,
        migrate_from=legacy_json if needs_migration else None,
    )

0 commit comments

Comments
 (0)