Skip to content

Commit 21b1b41

Browse files
devwhodevs and claude committed
feat: FTS5 search lane — BM25 full-text search
Add a SQLite FTS5 virtual table as a second search lane. It is populated during indexing alongside the vector embeddings, and supports exact keyword matches for ticket IDs, names, and dates that semantic search misses.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 60ce0bd commit 21b1b41

4 files changed

Lines changed: 194 additions & 1 deletion

File tree

src/fts.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
// FTS5 search support.
2+
//
3+
// The `FtsResult` struct and `fts_search` method live on `Store` (in store.rs)
4+
// since the store owns the database connection. We re-export `FtsResult` here
5+
// so downstream code can import it from either location.
6+
7+
pub use crate::store::FtsResult;
8+
9+
#[cfg(test)]
mod tests {
    use crate::docid::generate_docid;
    use crate::store::Store;

    /// Open an in-memory store and make sure the FTS5 table exists.
    fn setup_store() -> Store {
        let store = Store::open_memory().unwrap();
        store.ensure_fts_table().unwrap();
        store
    }

    /// Register a file record for `path` and return its id.
    ///
    /// Size and tags are irrelevant to the FTS tests, so they are fixed.
    fn add_file(store: &Store, path: &str, hash: &str) -> i64 {
        store
            .insert_file(path, hash, 100, &[], &generate_docid(path))
            .unwrap()
    }

    /// A hyphenated ticket id must be found verbatim (the hyphen must not be
    /// interpreted as an FTS5 operator).
    #[test]
    fn test_fts_exact_match() {
        let store = setup_store();
        let file_id = add_file(&store, "notes/ticket.md", "hash1");

        store
            .insert_fts_chunk(file_id, 0, "BRE-2579 delivery date extension for checkout")
            .unwrap();

        let results = store.fts_search("BRE-2579", 10).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].file_id, file_id);
        assert_eq!(results[0].chunk_seq, 0);
        assert!(results[0].score > 0.0, "score should be positive (negated BM25)");
    }

    /// A term that appears in no chunk yields an empty result set, not an error.
    #[test]
    fn test_fts_no_match() {
        let store = setup_store();
        let file_id = add_file(&store, "notes/note.md", "hash1");

        store
            .insert_fts_chunk(file_id, 0, "Rust programming language guide")
            .unwrap();

        let results = store.fts_search("kubernetes", 10).unwrap();
        assert_eq!(results.len(), 0);
    }

    /// Only matching chunks are returned, best match first.
    #[test]
    fn test_fts_multiple_results() {
        let store = setup_store();

        let file_id1 = add_file(&store, "notes/a.md", "h1");
        let file_id2 = add_file(&store, "notes/b.md", "h2");
        let file_id3 = add_file(&store, "notes/c.md", "h3");

        // All three chunks have the same number of words, so BM25 length
        // normalisation is neutral and term frequency alone decides the order:
        // the chunk that repeats "delivery" must rank first.
        store
            .insert_fts_chunk(file_id1, 0, "delivery date delivery schedule delivery tracking")
            .unwrap();
        store
            .insert_fts_chunk(file_id2, 0, "delivery date for the checkout page")
            .unwrap();
        store
            .insert_fts_chunk(file_id3, 0, "unrelated content about Rust and WebAssembly")
            .unwrap();

        let results = store.fts_search("delivery", 10).unwrap();
        assert_eq!(results.len(), 2, "only 2 chunks mention 'delivery'");

        // Results should be sorted by score descending.
        assert!(
            results[0].score >= results[1].score,
            "results should be ranked by relevance"
        );
        // Pin the actual ranking, not just the score ordering.
        assert_eq!(
            results[0].file_id, file_id1,
            "chunk repeating 'delivery' should rank first"
        );
        assert_eq!(results[1].file_id, file_id2);
    }

    /// Deleting a file's FTS rows removes all of its chunks and nothing else.
    #[test]
    fn test_fts_delete_chunks_for_file() {
        let store = setup_store();
        let file_id = add_file(&store, "notes/del.md", "hash1");
        let kept_id = add_file(&store, "notes/keep.md", "hash2");

        store.insert_fts_chunk(file_id, 0, "first chunk content").unwrap();
        store.insert_fts_chunk(file_id, 1, "second chunk content").unwrap();
        store.insert_fts_chunk(kept_id, 0, "unrelated chunk content").unwrap();

        // Verify they exist.
        let results = store.fts_search("chunk", 10).unwrap();
        assert_eq!(results.len(), 3);

        // Delete and verify only the other file's chunk survives.
        store.delete_fts_chunks_for_file(file_id).unwrap();
        let results = store.fts_search("chunk", 10).unwrap();
        assert_eq!(results.len(), 1, "only the untouched file's chunk should remain");
        assert_eq!(results[0].file_id, kept_id);
    }
}

src/indexer.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
167167
if !vector_ids.is_empty() {
168168
store.add_tombstones(&vector_ids)?;
169169
}
170+
store.delete_fts_chunks_for_file(record.id)?;
170171
store.delete_file(record.id)?;
171172
}
172173

@@ -180,6 +181,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
180181
if !vector_ids.is_empty() {
181182
store.add_tombstones(&vector_ids)?;
182183
}
184+
store.delete_fts_chunks_for_file(record.id)?;
183185
store.delete_file(record.id)?;
184186
}
185187
files_to_index.push(file_path.clone());
@@ -295,7 +297,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
295297
let file_id =
296298
store.insert_file(&result.rel_path, &result.hash, result.mtime, &result.tags, &docid)?;
297299

298-
for (heading, snippet, vector, token_count) in &result.chunks {
300+
for (chunk_seq, (heading, snippet, vector, token_count)) in result.chunks.iter().enumerate() {
299301
let vector_id = next_vector_id;
300302
next_vector_id += 1;
301303
store.insert_chunk_with_vector(
@@ -306,6 +308,7 @@ pub fn run_index(vault_path: &Path, config: &Config, rebuild: bool) -> Result<In
306308
*token_count as i64,
307309
vector,
308310
)?;
311+
store.insert_fts_chunk(file_id, chunk_seq as i64, snippet)?;
309312
}
310313
}
311314

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ pub mod chunker;
22
pub mod config;
33
pub mod docid;
44
pub mod embedder;
5+
pub mod fts;
56
pub mod hnsw;
67
pub mod indexer;
78
pub mod search;

src/store.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ pub struct ChunkRecord {
2626
pub token_count: i64,
2727
}
2828

29+
/// A single result from an FTS5 full-text search.
///
/// Produced by `Store::fts_search`, one value per matching row of the
/// `chunks_fts` table.
#[derive(Debug, Clone, PartialEq)]
pub struct FtsResult {
    /// Value of the `file_id` column stored with the matching chunk.
    pub file_id: i64,
    /// Value of the `chunk_seq` column stored with the matching chunk.
    pub chunk_seq: i64,
    /// Negated SQLite BM25 rank, so that a higher score means a better match.
    pub score: f64,
    /// Excerpt of the matching text with hits wrapped in `<b>`…`</b>` and
    /// elided text shown as `...`.
    pub snippet: String,
}
37+
2938
/// Summary statistics for the store.
3039
#[derive(Debug)]
3140
pub struct StoreStats {
@@ -100,6 +109,7 @@ impl Store {
100109
.execute_batch(SCHEMA)
101110
.context("failed to initialize schema")?;
102111
self.migrate()?;
112+
self.ensure_fts_table()?;
103113
Ok(())
104114
}
105115

@@ -439,6 +449,78 @@ impl Store {
439449
}
440450
}
441451

452+
// ── FTS5 ──────────────────────────────────────────────────
453+
454+
/// Ensure the FTS5 virtual table exists. Called during init.
455+
pub fn ensure_fts_table(&self) -> Result<()> {
456+
self.conn.execute_batch(
457+
"CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
458+
content,
459+
file_id UNINDEXED,
460+
chunk_seq UNINDEXED
461+
);",
462+
).context("failed to create FTS5 virtual table")?;
463+
Ok(())
464+
}
465+
466+
/// Insert a chunk's text into the FTS5 table.
467+
pub fn insert_fts_chunk(&self, file_id: i64, chunk_seq: i64, text: &str) -> Result<()> {
468+
self.conn.execute(
469+
"INSERT INTO chunks_fts (content, file_id, chunk_seq) VALUES (?1, ?2, ?3)",
470+
params![text, file_id, chunk_seq],
471+
)?;
472+
Ok(())
473+
}
474+
475+
/// Delete all FTS5 entries for a file.
476+
pub fn delete_fts_chunks_for_file(&self, file_id: i64) -> Result<()> {
477+
self.conn.execute(
478+
"DELETE FROM chunks_fts WHERE file_id = ?1",
479+
params![file_id],
480+
)?;
481+
Ok(())
482+
}
483+
484+
/// Search the FTS5 index. Returns results ranked by BM25 score.
485+
/// BM25 in SQLite returns negative values (more negative = better match),
486+
/// so we negate them to get positive scores where higher = better.
487+
///
488+
/// The query is wrapped in double quotes so that FTS5 treats it as a
489+
/// phrase/literal rather than interpreting operators like `-`.
490+
pub fn fts_search(&self, query: &str, limit: usize) -> Result<Vec<FtsResult>> {
491+
// Escape any double quotes in the query, then wrap in double quotes
492+
// so FTS5 treats hyphens etc. as literal characters.
493+
let escaped = query.replace('"', "\"\"");
494+
let fts_query = format!("\"{}\"", escaped);
495+
496+
let mut stmt = self.conn.prepare(
497+
"SELECT file_id, chunk_seq, bm25(chunks_fts) as score,
498+
snippet(chunks_fts, 0, '<b>', '</b>', '...', 64)
499+
FROM chunks_fts
500+
WHERE chunks_fts MATCH ?1
501+
ORDER BY score
502+
LIMIT ?2",
503+
)?;
504+
505+
let rows = stmt.query_map(params![fts_query, limit as i64], |row| {
506+
Ok(FtsResult {
507+
file_id: row.get(0)?,
508+
chunk_seq: row.get(1)?,
509+
score: {
510+
let raw: f64 = row.get(2)?;
511+
-raw // negate: SQLite BM25 returns negative, more negative = better
512+
},
513+
snippet: row.get(3)?,
514+
})
515+
})?;
516+
517+
let mut results = Vec::new();
518+
for row in rows {
519+
results.push(row?);
520+
}
521+
Ok(results)
522+
}
523+
442524
/// Return vector_ids for all chunks belonging to a file.
443525
/// Useful for tombstoning before re-indexing a changed file.
444526
pub fn get_vector_ids_for_file(&self, file_id: i64) -> Result<Vec<u64>> {

0 commit comments

Comments
 (0)