
Commit 7991a2d

devwhodevs and claude committed
fix: store vectors in SQLite and rebuild HNSW on each index
hnsw_rs doesn't support appending vectors after loading from disk. Fix by storing embedding vectors as BLOBs in the chunks table and rebuilding the HNSW index from scratch on each indexing run. Also pins the MODEL_SHA256 hash for integrity verification.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8e0764c commit 7991a2d

5 files changed: 101 additions & 41 deletions
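Read together, the diffs below boil down to one change in the indexing flow. A condensed sketch using the same names that appear in the diffs (not a separate implementation; setup and error handling elided):

    // Write path: each chunk row now carries its embedding as a BLOB.
    store.insert_chunk_with_vector(file_id, heading, snippet, vector_id, token_count, vector)?;

    // Read path: after all writes, rebuild the in-memory HNSW graph from SQLite...
    let all_vectors = store.get_all_vectors()?;
    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
    for (vid, vector) in &all_vectors {
        hnsw.insert_with_id(vector, *vid);
    }
    // ...and persist it for the query side.
    hnsw.save(&hnsw_dir)?;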


src/embedder.rs

Lines changed: 1 addition & 2 deletions
@@ -16,7 +16,7 @@ const TOKENIZER_URL: &str =
     "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer.json";
 /// SHA-256 of the ONNX model file. Set to empty string to skip verification
 /// until we can compute the real hash from a download.
-const MODEL_SHA256: &str = "";
+const MODEL_SHA256: &str = "6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452";
 pub const EMBEDDING_DIM: usize = 384;

 pub struct Embedder {
@@ -27,7 +27,6 @@ pub struct Embedder {
 impl Embedder {
     /// Create a new Embedder, downloading the model and tokenizer into
     /// `models_dir` if they are not already present.
-    #[allow(clippy::const_is_empty)] // MODEL_SHA256 is empty until a known hash is pinned.
     pub fn new(models_dir: &Path) -> Result<Self> {
         std::fs::create_dir_all(models_dir)
             .with_context(|| format!("creating models dir {}", models_dir.display()))?;
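Pinning MODEL_SHA256 only has an effect if Embedder::new actually hashes the downloaded file and compares; that verification code is outside this diff. A minimal sketch of such a check, assuming the sha2 and hex crates and a hypothetical verify_model_sha256 helper (not code from this commit):

    use anyhow::{bail, Result};
    use sha2::{Digest, Sha256};
    use std::path::Path;

    /// Hypothetical helper: hash a file on disk and compare against the pinned constant.
    fn verify_model_sha256(path: &Path, expected_hex: &str) -> Result<()> {
        // Empty string means "no hash pinned yet" — the pre-commit behaviour.
        if expected_hex.is_empty() {
            return Ok(());
        }
        let bytes = std::fs::read(path)?;
        let actual_hex = hex::encode(Sha256::digest(&bytes));
        if actual_hex != expected_hex {
            bail!("model SHA-256 mismatch: expected {expected_hex}, got {actual_hex}");
        }
        Ok(())
    }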

src/hnsw.rs

Lines changed: 14 additions & 0 deletions
@@ -63,6 +63,20 @@ impl HnswIndex {
         id
     }

+    /// Insert a vector with a specific ID (used when rebuilding from stored vectors).
+    pub fn insert_with_id(&mut self, vector: &[f32], id: u64) {
+        assert_eq!(
+            vector.len(),
+            EMBEDDING_DIM,
+            "vector dimension mismatch: expected {EMBEDDING_DIM}, got {}",
+            vector.len()
+        );
+        self.inner.insert((vector, id as usize));
+        if id >= self.next_id {
+            self.next_id = id + 1;
+        }
+    }
+
     /// Insert a batch of vectors and return their assigned vector IDs.
     pub fn insert_batch(&mut self, vectors: &[Vec<f32>]) -> Vec<u64> {
         vectors.iter().map(|v| self.insert(v)).collect()
src/indexer.rs

Lines changed: 24 additions & 20 deletions
@@ -6,7 +6,7 @@ use anyhow::{Context, Result};
 use ignore::WalkBuilder;
 use rayon::prelude::*;
 use sha2::{Digest, Sha256};
-use tracing::{info, warn};
+use tracing::info;

 use crate::chunker::{chunk_markdown, split_oversized_chunks};
 use crate::config::Config;
@@ -141,14 +141,6 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
     let store = Store::open(&db_path)?;

     let hnsw_dir = data_dir.join("hnsw");
-    let mut hnsw = if rebuild || !hnsw_dir.join("engraph.hnsw.data").exists() {
-        HnswIndex::new(100_000)
-    } else {
-        HnswIndex::load(&hnsw_dir).unwrap_or_else(|e| {
-            warn!("failed to load HNSW index, creating new: {e:#}");
-            HnswIndex::new(100_000)
-        })
-    };

     // If rebuild, treat everything as new.
     let files = walk_vault(vault_path, &config.exclude)?;
@@ -286,14 +278,23 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
         });
     }

-    // Step 8: Serial write — insert files + chunks into store, vectors into HNSW.
+    // Step 8: Serial write — insert files + chunks into store with vectors.
+    let mut next_vector_id: u64 = {
+        // Get the max existing vector_id to avoid collisions.
+        let all_existing = store.get_all_vectors().unwrap_or_default();
+        all_existing.iter().map(|(id, _)| *id).max().map_or(0, |m| m + 1)
+    };
+
     for result in &results {
         let file_id =
             store.insert_file(&result.rel_path, &result.hash, result.mtime, &result.tags)?;

         for (heading, snippet, vector, token_count) in &result.chunks {
-            let vector_id = hnsw.insert(vector);
-            store.insert_chunk(file_id, heading, snippet, vector_id, *token_count as i64)?;
+            let vector_id = next_vector_id;
+            next_vector_id += 1;
+            store.insert_chunk_with_vector(
+                file_id, heading, snippet, vector_id, *token_count as i64, vector,
+            )?;
         }
     }

@@ -310,16 +311,19 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
         ),
     )?;

-    // Step 10: Check tombstone ratio, auto-rebuild if >20%.
-    let stats = store.stats()?;
-    let total_vectors = stats.chunk_count + stats.tombstone_count;
-    if total_vectors > 0 && stats.tombstone_count * 100 / total_vectors > 20 {
-        info!(
-            tombstone_ratio = format!("{}%", stats.tombstone_count * 100 / total_vectors),
-            "tombstone ratio exceeds 20%, consider running with --rebuild"
-        );
+    // Step 10: Rebuild HNSW index from all vectors in SQLite.
+    // hnsw_rs doesn't support appending after load, so we always rebuild.
+    let all_vectors = store.get_all_vectors()?;
+    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
+    for (vid, vector) in &all_vectors {
+        hnsw.insert_with_id(vector, *vid);
     }

+    info!(
+        vectors = all_vectors.len(),
+        "rebuilt HNSW index from stored vectors"
+    );
+
     // Step 11: Save HNSW index to disk.
     hnsw.save(&hnsw_dir)?;
src/store.rs

Lines changed: 43 additions & 1 deletion
@@ -58,7 +58,8 @@ CREATE TABLE IF NOT EXISTS chunks (
     heading TEXT NOT NULL,
     snippet TEXT NOT NULL,
     vector_id INTEGER UNIQUE NOT NULL,
-    token_count INTEGER NOT NULL
+    token_count INTEGER NOT NULL,
+    vector BLOB
 );

 CREATE TABLE IF NOT EXISTS tombstones (
@@ -200,6 +201,47 @@ impl Store {
         Ok(())
     }

+    /// Insert a chunk with its embedding vector stored as a BLOB.
+    pub fn insert_chunk_with_vector(
+        &self,
+        file_id: i64,
+        heading: &str,
+        snippet: &str,
+        vector_id: u64,
+        token_count: i64,
+        vector: &[f32],
+    ) -> Result<()> {
+        let vector_bytes: Vec<u8> = vector.iter().flat_map(|f| f.to_le_bytes()).collect();
+        self.conn.execute(
+            "INSERT INTO chunks (file_id, heading, snippet, vector_id, token_count, vector)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+            params![file_id, heading, snippet, vector_id as i64, token_count, vector_bytes],
+        )?;
+        Ok(())
+    }
+
+    /// Get all stored vectors with their IDs for HNSW index rebuild.
+    /// Returns (vector_id, vector) pairs.
+    pub fn get_all_vectors(&self) -> Result<Vec<(u64, Vec<f32>)>> {
+        let mut stmt = self.conn.prepare(
+            "SELECT vector_id, vector FROM chunks WHERE vector IS NOT NULL"
+        )?;
+        let rows = stmt.query_map([], |row| {
+            let vid: i64 = row.get(0)?;
+            let blob: Vec<u8> = row.get(1)?;
+            let vector: Vec<f32> = blob
+                .chunks_exact(4)
+                .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]]))
+                .collect();
+            Ok((vid as u64, vector))
+        })?;
+        let mut results = Vec::new();
+        for row in rows {
+            results.push(row?);
+        }
+        Ok(results)
+    }
+
     pub fn get_chunks_by_file(&self, file_id: i64) -> Result<Vec<ChunkRecord>> {
         let mut stmt = self.conn.prepare(
             "SELECT id, file_id, heading, snippet, vector_id, token_count
tests/integration.rs

Lines changed: 19 additions & 18 deletions
@@ -54,12 +54,6 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
     let store = Store::open(&db_path).unwrap();
     let hnsw_dir = data_dir.join("hnsw");

-    let mut hnsw = if rebuild || !hnsw_dir.join("engraph.hnsw.data").exists() {
-        HnswIndex::new(100_000)
-    } else {
-        HnswIndex::load(&hnsw_dir).unwrap_or_else(|_| HnswIndex::new(100_000))
-    };
-
     let files = walk_vault(vault_path, &config.exclude).unwrap();

     let (new_files, changed_files, deleted_files) = if rebuild {
@@ -70,32 +64,32 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo

     // Handle deletes.
     for record in &deleted_files {
-        let vector_ids = store.get_vector_ids_for_file(record.id).unwrap();
-        if !vector_ids.is_empty() {
-            store.add_tombstones(&vector_ids).unwrap();
-        }
         store.delete_file(record.id).unwrap();
     }

-    // Handle updates (tombstone + re-index).
+    // Handle updates (delete old record, then treat as new).
     let mut files_to_index: Vec<PathBuf> = new_files.clone();
     for file_path in &changed_files {
         let rel = file_path.strip_prefix(vault_path).unwrap_or(file_path);
         let rel_str = rel.to_string_lossy().to_string();
         if let Some(record) = store.get_file(&rel_str).unwrap() {
-            let vector_ids = store.get_vector_ids_for_file(record.id).unwrap();
-            if !vector_ids.is_empty() {
-                store.add_tombstones(&vector_ids).unwrap();
-            }
             store.delete_file(record.id).unwrap();
         }
         files_to_index.push(file_path.clone());
     }

-    // Embed and store.
+    // Embed and store with vectors.
     let models_dir = data_dir.join("models");
     let mut embedder = Embedder::new(&models_dir).unwrap();

+    let mut next_vid: u64 = store
+        .get_all_vectors()
+        .unwrap_or_default()
+        .iter()
+        .map(|(id, _)| *id)
+        .max()
+        .map_or(0, |m| m + 1);
+
     for file_path in &files_to_index {
         let content = std::fs::read_to_string(file_path).unwrap();
         let rel = file_path.strip_prefix(vault_path).unwrap_or(file_path);
@@ -111,10 +105,11 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
         for chunk in &chunks {
             let heading = chunk.heading.clone().unwrap_or_default();
             let vec = embedder.embed_one(&chunk.text).unwrap();
-            let vector_id = hnsw.insert(&vec);
             let token_count = embedder.token_count(&chunk.text) as i64;
+            let vector_id = next_vid;
+            next_vid += 1;
             store
-                .insert_chunk(file_id, &heading, &chunk.snippet, vector_id, token_count)
+                .insert_chunk_with_vector(file_id, &heading, &chunk.snippet, vector_id, token_count, &vec)
                 .unwrap();
         }
     }
@@ -135,6 +130,12 @@ fn index_vault(vault_path: &Path, data_dir: &Path, config: &Config, rebuild: boo
     )
     .unwrap();

+    // Rebuild HNSW from all vectors in SQLite (hnsw_rs doesn't support append after load).
+    let all_vectors = store.get_all_vectors().unwrap();
+    let mut hnsw = HnswIndex::new(all_vectors.len().max(1000));
+    for (vid, vector) in &all_vectors {
+        hnsw.insert_with_id(vector, *vid);
+    }
     hnsw.save(&hnsw_dir).unwrap();

     files_to_index.len()
