
Commit 7991a2d

devwhodevs and claude committed
fix: store vectors in SQLite and rebuild HNSW on each index
hnsw_rs doesn't support appending vectors after loading from disk. Fix by storing embedding vectors as BLOBs in the chunks table and rebuilding the HNSW index from scratch on each indexing run. Also pins the MODEL_SHA256 hash for integrity verification.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8e0764c commit 7991a2d

5 files changed: 101 additions & 41 deletions
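Read together, the diffs below boil down to one change in the indexing flow. A condensed sketch using the same names that appear in the diffs (not a separate implementation; setup and error handling elided):

    // Write path: each chunk row now carries its embedding as a BLOB.
    store.insert_chunk_with_vector(file_id, heading, snippet, vector_id, token_count, vector)?;

    // Read path: after all writes, rebuild the in-memory HNSW graph from SQLite...
    let all_vectors = store.get_all_vectors()?;
    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
    for (vid, vector) in &all_vectors {
        hnsw.insert_with_id(vector, *vid);
    }
    // ...and persist it for the query side.
    hnsw.save(&hnsw_dir)?;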


src/embedder.rs

Lines changed: 1 addition & 2 deletions
@@ -16,7 +16,7 @@ const TOKENIZER_URL: &str =
     "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json";
 /// SHA-256 of the ONNX model file. Set to empty string to skip verification
 /// until we can compute the real hash from a download.
-const MODEL_SHA256: &str = "";
+const MODEL_SHA256: &str = "6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452";
 pub const EMBEDDING_DIM: usize = 384;

 pub struct Embedder {
@@ -27,7 +27,6 @@ pub struct Embedder {
 impl Embedder {
     /// Create a new Embedder, downloading the model and tokenizer into
     /// `models_dir` if they are not already present.
-    #[allow(clippy::const_is_empty)] // MODEL_SHA256 is empty until a known hash is pinned.
     pub fn new(models_dir: &Path) -> Result<Self> {
         std::fs::create_dir_all(models_dir)
             .with_context(|| format!("creating models dir {}", models_dir.display()))?;
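Pinning MODEL_SHA256 only has an effect if Embedder::new actually hashes the downloaded file and compares; that verification code is outside this diff. A minimal sketch of such a check, assuming the sha2 and hex crates and a hypothetical verify_model_sha256 helper (not code from this commit):

    use anyhow::{bail, Result};
    use sha2::{Digest, Sha256};
    use std::path::Path;

    /// Hypothetical helper: hash a file on disk and compare against the pinned constant.
    fn verify_model_sha256(path: &Path, expected_hex: &str) -> Result<()> {
        // Empty string means "no hash pinned yet" — the pre-commit behaviour.
        if expected_hex.is_empty() {
            return Ok(());
        }
        let bytes = std::fs::read(path)?;
        let actual_hex = hex::encode(Sha256::digest(&bytes));
        if actual_hex != expected_hex {
            bail!("model SHA-256 mismatch: expected {expected_hex}, got {actual_hex}");
        }
        Ok(())
    }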

src/hnsw.rs

Lines changed: 14 additions & 0 deletions
@@ -63,6 +63,20 @@ impl HnswIndex {
         id
     }

+    /// Insert a vector with a specific ID (used when rebuilding from stored vectors).
+    pub fn insert_with_id(&mut self, vector: &[f32], id: u64) {
+        assert_eq!(
+            vector.len(),
+            EMBEDDING_DIM,
+            "vector dimension mismatch: expected {EMBEDDING_DIM}, got {}",
+            vector.len()
+        );
+        self.inner.insert((vector, id as usize));
+        if id >= self.next_id {
+            self.next_id = id + 1;
+        }
+    }
+
     /// Insert a batch of vectors and return their assigned vector IDs.
     pub fn insert_batch(&mut self, vectors: &[Vec<f32>]) -> Vec<u64> {
         vectors.iter().map(|v| self.insert(v)).collect()
src/indexer.rs

Lines changed: 24 additions & 20 deletions
@@ -6,7 +6,7 @@ use anyhow::{Context, Result};
 use ignore::WalkBuilder;
 use rayon::prelude::*;
 use sha2::{Digest, Sha256};
-use tracing::{info, warn};
+use tracing::info;

 use crate::chunker::{chunk_markdown, split_oversized_chunks};
 use crate::config::Config;
@@ -141,14 +141,6 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
     let store = Store::open(&db_path)?;

     let hnsw_dir = data_dir.join("hnsw");
-    let mut hnsw = if rebuild || !hnsw_dir.join("engraph.hnsw.data").exists() {
-        HnswIndex::new(100_000)
-    } else {
-        HnswIndex::load(&hnsw_dir).unwrap_or_else(|e| {
-            warn!("failed to load HNSW index, creating new: {e:#}");
-            HnswIndex::new(100_000)
-        })
-    };

     // If rebuild, treat everything as new.
     let files = walk_vault(vault_path, &config.exclude)?;
@@ -286,14 +278,23 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
         });
     }

-    // Step 8: Serial write — insert files + chunks into store, vectors into HNSW.
+    // Step 8: Serial write — insert files + chunks into store with vectors.
+    let mut next_vector_id: u64 = {
+        // Get the max existing vector_id to avoid collisions.
+        let all_existing = store.get_all_vectors().unwrap_or_default();
+        all_existing.iter().map(|(id, _)| *id).max().map_or(0, |m| m + 1)
+    };
+
     for result in &results {
         let file_id =
             store.insert_file(&result.rel_path, &result.hash, result.mtime, &result.tags)?;

         for (heading, snippet, vector, token_count) in &result.chunks {
-            let vector_id = hnsw.insert(vector);
-            store.insert_chunk(file_id, heading, snippet, vector_id, *token_count as i64)?;
+            let vector_id = next_vector_id;
+            next_vector_id += 1;
+            store.insert_chunk_with_vector(
+                file_id, heading, snippet, vector_id, *token_count as i64, vector,
+            )?;
         }
     }

@@ -310,16 +311,19 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
         ),
     )?;

-    // Step 10: Check tombstone ratio, auto-rebuild if >20%.
-    let stats = store.stats()?;
-    let total_vectors = stats.chunk_count + stats.tombstone_count;
-    if total_vectors > 0 && stats.tombstone_count * 100 / total_vectors > 20 {
-        info!(
-            tombstone_ratio = format!("{}%", stats.tombstone_count * 100 / total_vectors),
-            "tombstone ratio exceeds 20%, consider running with --rebuild"
-        );
+    // Step 10: Rebuild HNSW index from all vectors in SQLite.
+    // hnsw_rs doesn't support appending after load, so we always rebuild.
+    let all_vectors = store.get_all_vectors()?;
+    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
+    for (vid, vector) in &all_vectors {
+        hnsw.insert_with_id(vector, *vid);
     }

+    info!(
+        vectors = all_vectors.len(),
+        "rebuilt HNSW index from stored vectors"
+    );
+
     // Step 11: Save HNSW index to disk.
     hnsw.save(&hnsw_dir)?;
src/store.rs

Lines changed: 43 additions & 1 deletion
@@ -58,7 +58,8 @@ CREATE TABLE IF NOT EXISTS chunks (
     heading TEXT NOT NULL,
     snippet TEXT NOT NULL,
     vector_id INTEGER UNIQUE NOT NULL,
-    token_count INTEGER NOT NULL
+    token_count INTEGER NOT NULL,
+    vector BLOB
 );

 CREATE TABLE IF NOT EXISTS tombstones (
@@ -200,6 +201,47 @@ impl Store {
         Ok(())
     }

+    /// Insert a chunk with its embedding vector stored as a BLOB.
+    pub fn insert_chunk_with_vector(
+        &self,
+        file_id: i64,
+        heading: &str,
+        snippet: &str,
+        vector_id: u64,
+        token_count: i64,
+        vector: &[f32],
+    ) -> Result<()> {
+        let vector_bytes: Vec<u8> = vector.iter().flat_map(|f| f.to_le_bytes()).collect();
+        self.conn.execute(
+            "INSERT INTO chunks (file_id, heading, snippet, vector_id, token_count, vector)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+            params![file_id, heading, snippet, vector_id as i64, token_count, vector_bytes],
+        )?;
+        Ok(())
+    }
+
+    /// Get all stored vectors with their IDs for HNSW index rebuild.
+    /// Returns (vector_id, vector) pairs.
+    pub fn get_all_vectors(&self) -> Result<Vec<(u64, Vec<f32>)>> {
+        let mut stmt = self.conn.prepare(
+            "SELECT vector_id, vector FROM chunks WHERE vector IS NOT NULL"
+        )?;
+        let rows = stmt.query_map([], |row| {
+            let vid: i64 = row.get(0)?;
+            let blob: Vec<u8> = row.get(1)?;
+            let vector: Vec<f32> = blob
+                .chunks_exact(4)
+                .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
+                .collect();
+            Ok((vid as u64, vector))
+        })?;
+        let mut results = Vec::new();
+        for row in rows {
+            results.push(row?);
+        }
+        Ok(results)
+    }
+
     pub fn get_chunks_by_file(&self, file_id: i64) -> Result<Vec<ChunkRecord>> {
         let mut stmt = self.conn.prepare(
             "SELECT id, file_id, heading, snippet, vector_id, token_count
tests/integration.rs

Lines changed: 19 additions & 18 deletions
@@ -54,12 +54,6 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
     let store = Store::open(&db_path).unwrap();
     let hnsw_dir = data_dir.join("hnsw");

-    let mut hnsw = if rebuild || !hnsw_dir.join("engraph.hnsw.data").exists() {
-        HnswIndex::new(100_000)
-    } else {
-        HnswIndex::load(&hnsw_dir).unwrap_or_else(|_| HnswIndex::new(100_000))
-    };
-
     let files = walk_vault(vault_path, &config.exclude).unwrap();

     let (new_files, changed_files, deleted_files) = if rebuild {
@@ -70,32 +64,32 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo

     // Handle deletes.
     for record in &deleted_files {
-        let vector_ids = store.get_vector_ids_for_file(record.id).unwrap();
-        if !vector_ids.is_empty() {
-            store.add_tombstones(&vector_ids).unwrap();
-        }
         store.delete_file(record.id).unwrap();
     }

-    // Handle updates (tombstone + re-index).
+    // Handle updates (delete old record, then treat as new).
     let mut files_to_index: Vec<PathBuf> = new_files.clone();
     for file_path in &changed_files {
         let rel = file_path.strip_prefix(vault_path).unwrap_or(file_path);
         let rel_str = rel.to_string_lossy().to_string();
         if let Some(record) = store.get_file(&rel_str).unwrap() {
-            let vector_ids = store.get_vector_ids_for_file(record.id).unwrap();
-            if !vector_ids.is_empty() {
-                store.add_tombstones(&vector_ids).unwrap();
-            }
             store.delete_file(record.id).unwrap();
         }
         files_to_index.push(file_path.clone());
     }

-    // Embed and store.
+    // Embed and store with vectors.
     let models_dir = data_dir.join("models");
     let mut embedder = Embedder::new(&models_dir).unwrap();

+    let mut next_vid: u64 = store
+        .get_all_vectors()
+        .unwrap_or_default()
+        .iter()
+        .map(|(id, _)| *id)
+        .max()
+        .map_or(0, |m| m + 1);
+
     for file_path in &files_to_index {
         let content = std::fs::read_to_string(file_path).unwrap();
         let rel = file_path.strip_prefix(vault_path).unwrap_or(file_path);
@@ -111,10 +105,11 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
         for chunk in &chunks {
             let heading = chunk.heading.clone().unwrap_or_default();
             let vec = embedder.embed_one(&chunk.text).unwrap();
-            let vector_id = hnsw.insert(&vec);
             let token_count = embedder.token_count(&chunk.text) as i64;
+            let vector_id = next_vid;
+            next_vid += 1;
             store
-                .insert_chunk(file_id, &heading, &chunk.snippet, vector_id, token_count)
+                .insert_chunk_with_vector(file_id, &heading, &chunk.snippet, vector_id, token_count, &vec)
                 .unwrap();
         }
     }
@@ -135,6 +130,12 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
     )
     .unwrap();

+    // Rebuild HNSW from all vectors in SQLite (hnsw_rs doesn't support append after load).
+    let all_vectors = store.get_all_vectors().unwrap();
+    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
+    for (vid, vector) in &all_vectors {
+        hnsw.insert_with_id(vector, *vid);
+    }
     hnsw.save(&hnsw_dir).unwrap();

     files_to_index.len()
