|
2 | 2 |
|
3 | 3 | import hashlib |
4 | 4 | import json |
| 5 | +import sqlite3 |
| 6 | +from contextlib import contextmanager |
5 | 7 | from pathlib import Path |
| 8 | +from typing import Iterator |
| 9 | + |
| 10 | + |
def _hash_file(path: Path) -> str:
    """Compute the SHA-256 digest of a file's contents.

    The file is opened in binary mode and consumed in 64 KiB pieces, so the
    whole file is never held in memory at once.

    Args:
        path: Location of the file to hash.

    Returns:
        The digest as a 64-character hexadecimal string.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # read() returns b"" at end of file, which ends the loop.
        while block := stream.read(65536):
            digest.update(block)
    return digest.hexdigest()
6 | 18 |
|
7 | 19 |
|
8 | 20 | class HashRegistry: |
@@ -57,8 +69,154 @@ def _persist(self) -> None: |
57 | 69 | @staticmethod |
58 | 70 | def hash_file(path: Path) -> str: |
59 | 71 | """Return the SHA-256 hex digest (64 chars) of the file at path.""" |
60 | | - h = hashlib.sha256() |
61 | | - with path.open("rb") as fh: |
62 | | - for chunk in iter(lambda: fh.read(65536), b""): |
63 | | - h.update(chunk) |
64 | | - return h.hexdigest() |
| 72 | + return _hash_file(path) |
| 73 | + |
| 74 | + |
| 75 | +class DbRegistry: |
| 76 | + """SQLite-backed registry mapping file SHA-256 hashes to metadata dicts. |
| 77 | + |
| 78 | + Provides better scalability, concurrency support, and extensibility |
| 79 | + compared to JSON-backed HashRegistry. |
| 80 | + """ |
| 81 | + |
| 82 | + def __init__(self, path: Path, migrate_from: Path | None = None) -> None: |
| 83 | + """Initialize DbRegistry. |
| 84 | + |
| 85 | + Args: |
| 86 | + path: Path to SQLite database file. |
| 87 | + migrate_from: Optional path to JSON file to migrate from. |
| 88 | + Migration only happens if DB doesn't exist yet. |
| 89 | + """ |
| 90 | + self._path = path |
| 91 | + should_migrate = migrate_from is not None and not path.exists() |
| 92 | + self._init_db() |
| 93 | + if should_migrate: |
| 94 | + self._migrate_from_json(migrate_from) |
| 95 | + |
| 96 | + def _migrate_from_json(self, json_path: Path) -> None: |
| 97 | + """Migrate data from JSON file to SQLite database.""" |
| 98 | + if not json_path.exists(): |
| 99 | + return |
| 100 | + |
| 101 | + with json_path.open("r", encoding="utf-8") as fh: |
| 102 | + data: dict[str, dict] = json.load(fh) |
| 103 | + |
| 104 | + with self._connect() as conn: |
| 105 | + for file_hash, metadata in data.items(): |
| 106 | + metadata_json = json.dumps(metadata, ensure_ascii=False) |
| 107 | + conn.execute(""" |
| 108 | + INSERT OR REPLACE INTO registry (file_hash, metadata_json) |
| 109 | + VALUES (?, ?) |
| 110 | + """, (file_hash, metadata_json)) |
| 111 | + |
| 112 | + def _init_db(self) -> None: |
| 113 | + """Initialize database schema if not exists.""" |
| 114 | + self._path.parent.mkdir(parents=True, exist_ok=True) |
| 115 | + |
| 116 | + with self._connect() as conn: |
| 117 | + conn.execute("PRAGMA journal_mode=WAL") |
| 118 | + conn.execute("PRAGMA foreign_keys=ON") |
| 119 | + conn.execute(""" |
| 120 | + CREATE TABLE IF NOT EXISTS registry ( |
| 121 | + file_hash TEXT PRIMARY KEY, |
| 122 | + metadata_json TEXT NOT NULL, |
| 123 | + created_at TEXT DEFAULT CURRENT_TIMESTAMP, |
| 124 | + updated_at TEXT DEFAULT CURRENT_TIMESTAMP |
| 125 | + ) |
| 126 | + """) |
| 127 | + conn.execute(""" |
| 128 | + CREATE INDEX IF NOT EXISTS idx_created_at ON registry(created_at) |
| 129 | + """) |
| 130 | + |
| 131 | + @contextmanager |
| 132 | + def _connect(self) -> Iterator[sqlite3.Connection]: |
| 133 | + """Context manager for database connections.""" |
| 134 | + conn = sqlite3.connect(str(self._path)) |
| 135 | + try: |
| 136 | + yield conn |
| 137 | + conn.commit() |
| 138 | + finally: |
| 139 | + conn.close() |
| 140 | + |
| 141 | + def is_known(self, file_hash: str) -> bool: |
| 142 | + """Return True if file_hash is already registered.""" |
| 143 | + with self._connect() as conn: |
| 144 | + cursor = conn.execute( |
| 145 | + "SELECT 1 FROM registry WHERE file_hash = ?", |
| 146 | + (file_hash,) |
| 147 | + ) |
| 148 | + return cursor.fetchone() is not None |
| 149 | + |
| 150 | + def get(self, file_hash: str) -> dict | None: |
| 151 | + """Return metadata for file_hash, or None if not found.""" |
| 152 | + with self._connect() as conn: |
| 153 | + cursor = conn.execute( |
| 154 | + "SELECT metadata_json FROM registry WHERE file_hash = ?", |
| 155 | + (file_hash,) |
| 156 | + ) |
| 157 | + row = cursor.fetchone() |
| 158 | + if row is None: |
| 159 | + return None |
| 160 | + return json.loads(row[0]) |
| 161 | + |
| 162 | + def all_entries(self) -> dict[str, dict]: |
| 163 | + """Return a shallow copy of all hash -> metadata entries.""" |
| 164 | + with self._connect() as conn: |
| 165 | + cursor = conn.execute( |
| 166 | + "SELECT file_hash, metadata_json FROM registry" |
| 167 | + ) |
| 168 | + return { |
| 169 | + row[0]: json.loads(row[1]) |
| 170 | + for row in cursor.fetchall() |
| 171 | + } |
| 172 | + |
| 173 | + def add(self, file_hash: str, metadata: dict) -> None: |
| 174 | + """Register file_hash with metadata and persist to disk. |
| 175 | + |
| 176 | + If file_hash already exists, updates the metadata. |
| 177 | + """ |
| 178 | + metadata_json = json.dumps(metadata, ensure_ascii=False) |
| 179 | + with self._connect() as conn: |
| 180 | + conn.execute(""" |
| 181 | + INSERT INTO registry (file_hash, metadata_json, updated_at) |
| 182 | + VALUES (?, ?, CURRENT_TIMESTAMP) |
| 183 | + ON CONFLICT(file_hash) DO UPDATE SET |
| 184 | + metadata_json = excluded.metadata_json, |
| 185 | + updated_at = CURRENT_TIMESTAMP |
| 186 | + """, (file_hash, metadata_json)) |
| 187 | + |
| 188 | + @staticmethod |
| 189 | + def hash_file(path: Path) -> str: |
| 190 | + """Return the SHA-256 hex digest (64 chars) of the file at path.""" |
| 191 | + return _hash_file(path) |
| 192 | + |
| 193 | + |
def get_registry(
    openkb_dir: Path,
    backend: str = "sqlite",
) -> HashRegistry | DbRegistry:
    """Build the registry implementation selected by ``backend``.

    Args:
        openkb_dir: Path to the .openkb directory holding the registry file.
        backend: Storage backend, either "sqlite" or "json".

    Returns:
        A HashRegistry over ``hashes.json`` for "json", or a DbRegistry over
        ``hashes.db`` for "sqlite".

    Raises:
        ValueError: If ``backend`` is neither "sqlite" nor "json".

    For the "sqlite" backend, when ``hashes.json`` exists and ``hashes.db``
    does not, the JSON file is handed to DbRegistry as the migration source.
    """
    if backend == "json":
        return HashRegistry(openkb_dir / "hashes.json")
    if backend != "sqlite":
        raise ValueError(f"Unknown storage_backend: {backend!r}")

    database = openkb_dir / "hashes.db"
    legacy_json = openkb_dir / "hashes.json"

    # Only offer the JSON file for import on the first switch to SQLite.
    first_switch = legacy_json.exists() and not database.exists()
    migration_source = legacy_json if first_switch else None
    return DbRegistry(database, migrate_from=migration_source)
0 commit comments