Skip to content

Commit 1b60898

Browse files
committed
feat: add SQLite backend and migration tests
1 parent e3bee82 commit 1b60898

6 files changed

Lines changed: 338 additions & 6 deletions

File tree

tests/test_cli.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from unittest.mock import patch
33

44
import pytest
5+
import yaml
56
from click.testing import CliRunner
67

78
from openkb.cli import cli
@@ -30,11 +31,11 @@ def test_init_creates_structure(tmp_path):
3031
assert (cwd / "wiki" / "log.md").is_file()
3132
assert (cwd / "wiki" / "index.md").is_file()
3233
assert (cwd / ".openkb" / "config.yaml").is_file()
33-
assert (cwd / ".openkb" / "hashes.json").is_file()
34+
# SQLite DB is created lazily by get_registry() on first access
35+
assert not (cwd / ".openkb" / "hashes.json").exists()
3436

35-
# hashes.json is empty object
36-
hashes = json.loads((cwd / ".openkb" / "hashes.json").read_text())
37-
assert hashes == {}
37+
config = yaml.safe_load((cwd / ".openkb" / "config.yaml").read_text())
38+
assert config["storage_backend"] == "sqlite"
3839

3940
# index.md header
4041
index_content = (cwd / "wiki" / "index.md").read_text()
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
"""Tests for the ``storage_backend`` config option."""
from __future__ import annotations

from openkb.config import DEFAULT_CONFIG, load_config, save_config


def test_default_config_has_storage_backend():
    """DEFAULT_CONFIG should include the storage_backend key."""
    assert "storage_backend" in DEFAULT_CONFIG


def test_default_storage_backend_is_sqlite():
    """Default storage_backend should be 'sqlite'."""
    assert DEFAULT_CONFIG["storage_backend"] == "sqlite"


def test_load_config_includes_storage_backend(tmp_path):
    """load_config should return storage_backend from config file."""
    config_path = tmp_path / "config.yaml"
    save_config(config_path, {"storage_backend": "json"})
    loaded = load_config(config_path)
    assert loaded["storage_backend"] == "json"


def test_storage_backend_valid_values(tmp_path):
    """storage_backend should accept 'sqlite' or 'json'."""
    config_path = tmp_path / "config.yaml"

    # Both supported backends must round-trip through save/load unchanged.
    for backend in ("sqlite", "json"):
        save_config(config_path, {"storage_backend": backend})
        loaded = load_config(config_path)
        assert loaded["storage_backend"] == backend

tests/test_converter.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,15 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir):
4848

4949
def test_md_duplicate_skipped(self, kb_dir):
5050
"""Second call with same file returns skipped=True when hash is registered."""
51-
from openkb.state import HashRegistry
51+
from openkb.state import get_registry
5252

5353
src = kb_dir / "raw" / "notes.md"
5454
src.write_text("# Notes\n\nSome content here.", encoding="utf-8")
5555

5656
result1 = convert_document(src, kb_dir) # first call
5757
# Simulate CLI registering the hash after successful compilation
58-
registry = HashRegistry(kb_dir / ".openkb" / "hashes.json")
58+
openkb_dir = kb_dir / ".openkb"
59+
registry = get_registry(openkb_dir, backend="sqlite")
5960
registry.add(result1.file_hash, {"name": src.name, "type": "md"})
6061

6162
result2 = convert_document(src, kb_dir) # second call

tests/test_db_registry.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
"""Tests for DbRegistry SQLite-backed storage."""
2+
from __future__ import annotations
3+
4+
import json
5+
import sqlite3
6+
from pathlib import Path
7+
8+
import pytest
9+
10+
from openkb.state import DbRegistry
11+
12+
13+
def test_db_registry_creates_database_file(tmp_path):
14+
"""DbRegistry should create a .db file on init."""
15+
db_path = tmp_path / "hashes.db"
16+
registry = DbRegistry(db_path)
17+
assert db_path.exists()
18+
19+
20+
def test_db_registry_creates_table(tmp_path):
21+
"""DbRegistry should create the registry table."""
22+
db_path = tmp_path / "hashes.db"
23+
registry = DbRegistry(db_path)
24+
25+
conn = sqlite3.connect(str(db_path))
26+
cursor = conn.execute(
27+
"SELECT name FROM sqlite_master WHERE type='table' AND name='registry'"
28+
)
29+
result = cursor.fetchone()
30+
conn.close()
31+
assert result is not None
32+
33+
34+
def test_db_empty_registry_is_known_false(tmp_path):
35+
"""Empty DbRegistry should return False for is_known."""
36+
registry = DbRegistry(tmp_path / "hashes.db")
37+
assert registry.is_known("abc123") is False
38+
39+
40+
def test_db_empty_registry_get_returns_none(tmp_path):
41+
"""Empty DbRegistry should return None for get."""
42+
registry = DbRegistry(tmp_path / "hashes.db")
43+
assert registry.get("abc123") is None
44+
45+
46+
def test_db_add_and_is_known(tmp_path):
47+
"""After add, is_known should return True."""
48+
registry = DbRegistry(tmp_path / "hashes.db")
49+
registry.add("deadbeef", {"filename": "test.pdf"})
50+
assert registry.is_known("deadbeef") is True
51+
52+
53+
def test_db_add_and_get(tmp_path):
54+
"""After add, get should return the metadata."""
55+
registry = DbRegistry(tmp_path / "hashes.db")
56+
metadata = {"filename": "doc.pdf", "pages": 10}
57+
registry.add("cafebabe", metadata)
58+
assert registry.get("cafebabe") == metadata
59+
60+
61+
def test_db_persistence_across_instances(tmp_path):
62+
"""Data should persist across DbRegistry instances."""
63+
db_path = tmp_path / "hashes.db"
64+
r1 = DbRegistry(db_path)
65+
r1.add("hash1", {"file": "a.pdf"})
66+
67+
r2 = DbRegistry(db_path)
68+
assert r2.is_known("hash1") is True
69+
assert r2.get("hash1") == {"file": "a.pdf"}
70+
71+
72+
def test_db_all_entries_returns_all(tmp_path):
73+
"""all_entries should return all hash -> metadata mappings."""
74+
registry = DbRegistry(tmp_path / "hashes.db")
75+
registry.add("h1", {"name": "one"})
76+
registry.add("h2", {"name": "two"})
77+
entries = registry.all_entries()
78+
assert "h1" in entries
79+
assert "h2" in entries
80+
assert entries["h1"] == {"name": "one"}
81+
assert entries["h2"] == {"name": "two"}
82+
83+
84+
def test_db_all_entries_empty(tmp_path):
85+
"""all_entries on empty registry should return empty dict."""
86+
registry = DbRegistry(tmp_path / "hashes.db")
87+
assert registry.all_entries() == {}
88+
89+
90+
def test_db_hash_file_unchanged(tmp_path):
91+
"""DbRegistry.hash_file should work same as HashRegistry."""
92+
f = tmp_path / "sample.txt"
93+
f.write_text("hello world")
94+
digest = DbRegistry.hash_file(f)
95+
assert len(digest) == 64
96+
assert all(c in "0123456789abcdef" for c in digest)
97+
98+
99+
def test_db_update_existing_hash(tmp_path):
100+
"""Adding same hash twice should update metadata."""
101+
registry = DbRegistry(tmp_path / "hashes.db")
102+
registry.add("hash1", {"version": 1})
103+
registry.add("hash1", {"version": 2})
104+
assert registry.get("hash1") == {"version": 2}
105+
106+
107+
def test_db_metadata_with_nested_dict(tmp_path):
108+
"""Metadata can contain nested dictionaries."""
109+
registry = DbRegistry(tmp_path / "hashes.db")
110+
metadata = {
111+
"name": "doc.pdf",
112+
"stats": {"pages": 10, "words": 5000},
113+
}
114+
registry.add("hash1", metadata)
115+
assert registry.get("hash1") == metadata
116+
117+
118+
def test_db_wal_mode_enabled(tmp_path):
119+
"""Database should use WAL mode for concurrency."""
120+
db_path = tmp_path / "hashes.db"
121+
DbRegistry(db_path)
122+
123+
conn = sqlite3.connect(str(db_path))
124+
cursor = conn.execute("PRAGMA journal_mode")
125+
result = cursor.fetchone()
126+
conn.close()
127+
assert result[0].lower() == "wal"
128+
129+
130+
def test_migrate_from_json(tmp_path):
131+
"""DbRegistry should migrate existing JSON data on first access."""
132+
json_path = tmp_path / "hashes.json"
133+
existing_data = {
134+
"hash1": {"name": "doc1.pdf", "pages": 10},
135+
"hash2": {"name": "doc2.pdf", "pages": 20},
136+
}
137+
json_path.write_text(json.dumps(existing_data), encoding="utf-8")
138+
139+
db_path = tmp_path / "hashes.db"
140+
registry = DbRegistry(db_path, migrate_from=json_path)
141+
142+
assert registry.is_known("hash1")
143+
assert registry.is_known("hash2")
144+
assert registry.get("hash1") == {"name": "doc1.pdf", "pages": 10}
145+
assert registry.get("hash2") == {"name": "doc2.pdf", "pages": 20}
146+
147+
148+
def test_migrate_only_once(tmp_path):
149+
"""Migration should only happen once, not on subsequent loads."""
150+
json_path = tmp_path / "hashes.json"
151+
existing_data = {"hash1": {"name": "doc1.pdf"}}
152+
json_path.write_text(json.dumps(existing_data), encoding="utf-8")
153+
154+
db_path = tmp_path / "hashes.db"
155+
156+
r1 = DbRegistry(db_path, migrate_from=json_path)
157+
assert r1.is_known("hash1")
158+
159+
existing_data["hash2"] = {"name": "doc2.pdf"}
160+
json_path.write_text(json.dumps(existing_data), encoding="utf-8")
161+
162+
r2 = DbRegistry(db_path, migrate_from=json_path)
163+
assert r2.is_known("hash1")
164+
assert not r2.is_known("hash2")
165+
166+
167+
def test_migrate_optional(tmp_path):
168+
"""DbRegistry should work without migration."""
169+
db_path = tmp_path / "hashes.db"
170+
registry = DbRegistry(db_path)
171+
registry.add("hash1", {"name": "doc.pdf"})
172+
assert registry.is_known("hash1")

tests/test_migration.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Integration tests for JSON to SQLite migration."""
2+
from __future__ import annotations
3+
4+
import json
5+
import threading
6+
from pathlib import Path
7+
8+
import pytest
9+
10+
from openkb.state import get_registry, DbRegistry
11+
12+
13+
def test_full_migration_workflow(tmp_path):
14+
"""Test complete migration from JSON to SQLite."""
15+
openkb_dir = tmp_path / ".openkb"
16+
openkb_dir.mkdir()
17+
18+
# Step 1: Start with JSON backend
19+
json_registry = get_registry(openkb_dir, backend="json")
20+
json_registry.add("hash1", {"name": "doc1.pdf", "pages": 10})
21+
json_registry.add("hash2", {"name": "doc2.pdf", "pages": 20})
22+
23+
# Verify JSON file exists
24+
json_path = openkb_dir / "hashes.json"
25+
assert json_path.exists()
26+
27+
# Step 2: Switch to SQLite backend (triggers migration)
28+
sqlite_registry = get_registry(openkb_dir, backend="sqlite")
29+
30+
# Verify data was migrated
31+
assert sqlite_registry.is_known("hash1")
32+
assert sqlite_registry.is_known("hash2")
33+
assert sqlite_registry.get("hash1") == {"name": "doc1.pdf", "pages": 10}
34+
assert sqlite_registry.get("hash2") == {"name": "doc2.pdf", "pages": 20}
35+
36+
# Step 3: Add new data via SQLite
37+
sqlite_registry.add("hash3", {"name": "doc3.pdf", "pages": 30})
38+
39+
# Step 4: Create new SQLite instance - should have all data
40+
sqlite_registry2 = get_registry(openkb_dir, backend="sqlite")
41+
assert sqlite_registry2.is_known("hash1")
42+
assert sqlite_registry2.is_known("hash2")
43+
assert sqlite_registry2.is_known("hash3")
44+
45+
46+
def test_concurrent_sqlite_access(tmp_path):
47+
"""Test that SQLite handles concurrent access correctly."""
48+
openkb_dir = tmp_path / ".openkb"
49+
openkb_dir.mkdir()
50+
51+
registry = get_registry(openkb_dir, backend="sqlite")
52+
errors = []
53+
54+
def add_entries(start: int, count: int) -> None:
55+
try:
56+
for i in range(start, start + count):
57+
registry.add(f"hash{i}", {"index": i})
58+
except Exception as e:
59+
errors.append(e)
60+
61+
threads = [
62+
threading.Thread(target=add_entries, args=(0, 50)),
63+
threading.Thread(target=add_entries, args=(50, 50)),
64+
threading.Thread(target=add_entries, args=(100, 50)),
65+
]
66+
67+
for t in threads:
68+
t.start()
69+
for t in threads:
70+
t.join()
71+
72+
assert not errors
73+
entries = registry.all_entries()
74+
assert len(entries) == 150

tests/test_state.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,50 @@ def test_load_existing_json(tmp_path):
8282
registry = HashRegistry(path)
8383
assert registry.is_known("existinghash") is True
8484
assert registry.get("existinghash") == {"file": "pre.pdf"}
85+
86+
87+
# ---------------------------------------------------------------------------
# Factory function tests
# ---------------------------------------------------------------------------

# NOTE(review): mid-file import kept to match the existing section layout;
# consider moving it into the top-of-file import block.
from openkb.state import DbRegistry, get_registry


def test_get_registry_returns_db_registry_by_default(tmp_path):
    """get_registry should return DbRegistry by default."""
    openkb_dir = tmp_path / ".openkb"
    openkb_dir.mkdir()

    registry = get_registry(openkb_dir)
    # isinstance is more robust than comparing type(...).__name__ strings.
    assert isinstance(registry, DbRegistry)


def test_get_registry_returns_hash_registry_for_json_backend(tmp_path):
    """get_registry should return HashRegistry when backend is 'json'."""
    openkb_dir = tmp_path / ".openkb"
    openkb_dir.mkdir()

    registry = get_registry(openkb_dir, backend="json")
    # HashRegistry is already imported at the top of this module.
    assert isinstance(registry, HashRegistry)


def test_get_registry_returns_db_registry_for_sqlite_backend(tmp_path):
    """get_registry should return DbRegistry when backend is 'sqlite'."""
    openkb_dir = tmp_path / ".openkb"
    openkb_dir.mkdir()

    registry = get_registry(openkb_dir, backend="sqlite")
    assert isinstance(registry, DbRegistry)


def test_get_registry_migrates_json_to_sqlite(tmp_path):
    """get_registry should migrate existing JSON when switching to sqlite."""
    openkb_dir = tmp_path / ".openkb"
    openkb_dir.mkdir()

    json_path = openkb_dir / "hashes.json"
    json_path.write_text('{"hash1": {"name": "doc.pdf"}}', encoding="utf-8")

    registry = get_registry(openkb_dir, backend="sqlite")
    assert registry.is_known("hash1")
    assert registry.get("hash1") == {"name": "doc.pdf"}

0 commit comments

Comments
 (0)