Skip to content

Commit ce8fd29

Browse files
devwhodevs and claude committed
feat: search pipeline with human/JSON output and status formatting
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9b44e50 commit ce8fd29

3 files changed

Lines changed: 333 additions & 0 deletions

File tree

src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ mod config;
33
mod embedder;
44
mod hnsw;
55
mod indexer;
6+
mod search;
67
mod store;
78

89
use anyhow::Result;

src/search.rs

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
use std::path::Path;
2+
3+
use anyhow::{Context, Result};
4+
use serde_json::json;
5+
6+
use crate::embedder::Embedder;
7+
use crate::hnsw::HnswIndex;
8+
use crate::store::{Store, StoreStats};
9+
10+
/// A single search result with metadata.
pub struct SearchResult {
    /// Similarity score; computed in `run_search` as `1.0 - distance`,
    /// so higher means more similar.
    pub score: f32,
    /// Path of the source file, or "<unknown>" when the store has no path
    /// for the chunk's file id.
    pub file_path: String,
    /// Heading the chunk sits under; `None` when the stored heading was empty.
    pub heading: Option<String>,
    /// Text excerpt for display (truncated to 200 chars in human output only).
    pub snippet: String,
}
17+
18+
/// Run a search query and print results.
///
/// Loads the embedder, HNSW index, and chunk store from `data_dir`, embeds
/// `query`, retrieves the nearest non-tombstoned chunks, and prints them via
/// `format_results` in human or JSON form.
pub fn run_search(query: &str, top_n: usize, json: bool, data_dir: &Path) -> Result<()> {
    let models_dir = data_dir.join("models");
    let mut embedder = Embedder::new(&models_dir).context("loading embedder")?;

    let hnsw_dir = data_dir.join("hnsw");
    let index = HnswIndex::load(&hnsw_dir).context("loading HNSW index")?;

    let db_path = data_dir.join("store.db");
    let store = Store::open(&db_path).context("opening store")?;

    let query_vec = embedder.embed_one(query).context("embedding query")?;

    let tombstones = store.get_tombstones().context("loading tombstones")?;

    // Tombstoned vector ids are handed to the index so deleted chunks are
    // excluded. NOTE(review): exactly `top_n` results are requested here, not
    // extra as the previous comment claimed — if `HnswIndex::search` filters
    // tombstones *after* candidate selection rather than during it, fewer
    // than `top_n` hits may come back; confirm against the index impl.
    let raw_results = index.search(&query_vec, top_n, &tombstones);

    let mut results = Vec::new();
    for (vector_id, distance) in raw_results {
        // A chunk can be missing if the store and index are out of sync;
        // such hits are silently skipped.
        if let Some(chunk) = store.get_chunk_by_vector_id(vector_id)? {
            let file_path = store
                .get_file_path_by_id(chunk.file_id)?
                .unwrap_or_else(|| "<unknown>".to_string());

            // Convert cosine distance to similarity score.
            let score = 1.0 - distance;
            let heading = if chunk.heading.is_empty() {
                None
            } else {
                Some(chunk.heading)
            };

            results.push(SearchResult {
                score,
                file_path,
                heading,
                snippet: chunk.snippet,
            });
        }
    }

    let output = format_results(&results, json);
    print!("{output}");
    Ok(())
}
64+
65+
/// Run the status command and print index information.
66+
pub fn run_status(json: bool, data_dir: &Path) -> Result<()> {
67+
let db_path = data_dir.join("store.db");
68+
let store = Store::open(&db_path).context("opening store")?;
69+
let stats = store.stats()?;
70+
71+
// Compute index size on disk (sum of HNSW files).
72+
let hnsw_dir = data_dir.join("hnsw");
73+
let index_size = dir_size(&hnsw_dir);
74+
75+
let model_name = "all-MiniLM-L6-v2";
76+
77+
let output = format_status(&stats, index_size, model_name, json);
78+
print!("{output}");
79+
Ok(())
80+
}
81+
82+
/// Format search results for display (pure function, no I/O).
83+
pub fn format_results(results: &[SearchResult], json: bool) -> String {
84+
if results.is_empty() {
85+
return "No results found.\n".to_string();
86+
}
87+
88+
if json {
89+
let items: Vec<serde_json::Value> = results
90+
.iter()
91+
.enumerate()
92+
.map(|(i, r)| {
93+
// Round score to 2 decimal places via f64 to avoid f32 precision artifacts.
94+
let score_rounded = ((r.score as f64) * 100.0).round() / 100.0;
95+
json!({
96+
"rank": i + 1,
97+
"score": score_rounded,
98+
"file": r.file_path,
99+
"heading": r.heading,
100+
"snippet": r.snippet,
101+
})
102+
})
103+
.collect();
104+
format!("{}\n", serde_json::to_string_pretty(&items).unwrap())
105+
} else {
106+
let mut out = String::new();
107+
for (i, r) in results.iter().enumerate() {
108+
let heading_part = match &r.heading {
109+
Some(h) => format!(" > {h}"),
110+
None => String::new(),
111+
};
112+
let snippet = truncate_snippet(&r.snippet, 200);
113+
out.push_str(&format!(
114+
"{:>2}. [{:.2}] {}{}\n {}\n",
115+
i + 1,
116+
r.score,
117+
r.file_path,
118+
heading_part,
119+
snippet,
120+
));
121+
}
122+
out
123+
}
124+
}
125+
126+
/// Format status information for display (pure function, no I/O).
127+
pub fn format_status(stats: &StoreStats, index_size: u64, model_name: &str, json: bool) -> String {
128+
let vault = stats
129+
.vault_path
130+
.as_deref()
131+
.unwrap_or("<not set>");
132+
let last_indexed = stats
133+
.last_indexed_at
134+
.as_deref()
135+
.unwrap_or("never");
136+
137+
if json {
138+
let obj = json!({
139+
"vault": vault,
140+
"files": stats.file_count,
141+
"chunks": stats.chunk_count,
142+
"tombstones": stats.tombstone_count,
143+
"last_indexed": last_indexed,
144+
"index_size": index_size,
145+
"model": model_name,
146+
});
147+
format!("{}\n", serde_json::to_string_pretty(&obj).unwrap())
148+
} else {
149+
format!(
150+
"Vault: {}\n\
151+
Files: {}\n\
152+
Chunks: {}\n\
153+
Tombstones: {} (pending cleanup)\n\
154+
Last index: {}\n\
155+
Index size: {}\n\
156+
Model: {}\n",
157+
vault,
158+
stats.file_count,
159+
stats.chunk_count,
160+
stats.tombstone_count,
161+
last_indexed,
162+
format_bytes(index_size),
163+
model_name,
164+
)
165+
}
166+
}
167+
168+
/// Truncate a string to at most `max_len` characters, appending "..." if
/// truncated.
///
/// Counts Unicode scalar values (`char`s), not bytes. The previous version
/// compared `s.len()` (bytes) against `max_len`, so multibyte text was cut
/// well short of the documented `max_len` characters; counting chars matches
/// the documented contract and is identical for ASCII input.
fn truncate_snippet(s: &str, max_len: usize) -> String {
    // `char_indices().nth(max_len)` yields the byte offset of the character
    // just past the allowed prefix; `None` means the string is short enough.
    match s.char_indices().nth(max_len) {
        None => s.to_string(),
        Some((cut, _)) => format!("{}...", &s[..cut]),
    }
}
181+
182+
/// Format a byte count as a human-readable string.
///
/// Uses binary units (1 KB = 1024 B) with one decimal place; counts below
/// 1 KB are printed as plain bytes without a fraction.
fn format_bytes(bytes: u64) -> String {
    // Largest unit first, so the first threshold that fits wins.
    const UNITS: [(u64, &str); 3] = [
        (1 << 30, "GB"),
        (1 << 20, "MB"),
        (1 << 10, "KB"),
    ];
    for (threshold, suffix) in UNITS {
        if bytes >= threshold {
            return format!("{:.1} {}", bytes as f64 / threshold as f64, suffix);
        }
    }
    format!("{bytes} B")
}
198+
199+
/// Compute total size of all files in a directory (non-recursive is fine for HNSW).
///
/// Returns 0 when the directory is missing or unreadable; entries whose
/// metadata cannot be read, and non-file entries, are skipped.
fn dir_size(path: &Path) -> u64 {
    std::fs::read_dir(path)
        .map(|entries| {
            entries
                .flatten()
                .filter_map(|entry| entry.metadata().ok())
                .filter(|meta| meta.is_file())
                .map(|meta| meta.len())
                .sum()
        })
        .unwrap_or(0)
}
216+
217+
#[cfg(test)]
mod tests {
    use super::*;

    /// Single search result shared by the result-formatting tests.
    fn sample_result() -> SearchResult {
        SearchResult {
            score: 0.87,
            file_path: "foo.md".to_string(),
            heading: Some("## Bar".to_string()),
            snippet: "Some text...".to_string(),
        }
    }

    /// Store-stats fixture shared by the status-formatting tests.
    fn sample_stats() -> StoreStats {
        StoreStats {
            file_count: 42,
            chunk_count: 187,
            tombstone_count: 3,
            last_indexed_at: Some("2026-03-19 14:30:00".to_string()),
            vault_path: Some("/path/to/vault".to_string()),
        }
    }

    #[test]
    fn test_format_human_result() {
        let output = format_results(&[sample_result()], false);
        assert_eq!(output, " 1. [0.87] foo.md > ## Bar\n Some text...\n");
    }

    #[test]
    fn test_format_json_result() {
        let output = format_results(&[sample_result()], true);
        let parsed: Vec<serde_json::Value> = serde_json::from_str(&output).unwrap();
        assert_eq!(parsed.len(), 1);
        let item = &parsed[0];
        assert_eq!(item["rank"], 1);
        assert_eq!(item["score"], 0.87);
        assert_eq!(item["file"], "foo.md");
        assert_eq!(item["heading"], "## Bar");
        assert_eq!(item["snippet"], "Some text...");
    }

    #[test]
    fn test_no_results_message() {
        assert_eq!(format_results(&[], false), "No results found.\n");
        // The JSON path currently shares the plain-text empty message.
        assert_eq!(format_results(&[], true), "No results found.\n");
    }

    #[test]
    fn test_format_status_human() {
        let output = format_status(&sample_stats(), 2_516_582, "all-MiniLM-L6-v2", false);
        for (needle, what) in [
            ("/path/to/vault", "vault path"),
            ("42", "file count"),
            ("187", "chunk count"),
            ("3", "tombstone count"),
            ("2026-03-19 14:30:00", "last index"),
            ("2.4 MB", "index size"),
            ("all-MiniLM-L6-v2", "model"),
        ] {
            assert!(output.contains(needle), "missing {what}");
        }
    }

    #[test]
    fn test_format_status_json() {
        let output = format_status(&sample_stats(), 2_516_582, "all-MiniLM-L6-v2", true);
        let parsed: serde_json::Value = serde_json::from_str(&output).unwrap();
        assert_eq!(parsed["vault"], "/path/to/vault");
        assert_eq!(parsed["files"], 42);
        assert_eq!(parsed["chunks"], 187);
        assert_eq!(parsed["tombstones"], 3);
        assert_eq!(parsed["last_indexed"], "2026-03-19 14:30:00");
        assert_eq!(parsed["index_size"], 2_516_582);
        assert_eq!(parsed["model"], "all-MiniLM-L6-v2");
    }

    #[test]
    fn test_truncate_snippet() {
        assert_eq!(truncate_snippet("hello", 200), "hello");

        let long = "a".repeat(300);
        let truncated = truncate_snippet(&long, 200);
        assert!(truncated.ends_with("..."));
        assert_eq!(truncated.len(), 203); // 200 kept + "..."
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(0), "0 B");
        assert_eq!(format_bytes(512), "512 B");
        assert_eq!(format_bytes(1024), "1.0 KB");
        assert_eq!(format_bytes(1024 * 1024), "1.0 MB");
        assert_eq!(format_bytes(2_516_582), "2.4 MB");
    }
}

src/store.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,18 @@ impl Store {
301301
})
302302
}
303303

304+
/// Look up a file's path by its row ID.
305+
pub fn get_file_path_by_id(&self, file_id: i64) -> Result<Option<String>> {
306+
let mut stmt = self
307+
.conn
308+
.prepare("SELECT path FROM files WHERE id = ?1")?;
309+
let mut rows = stmt.query_map(params![file_id], |row| row.get::<_, String>(0))?;
310+
match rows.next() {
311+
Some(val) => Ok(Some(val?)),
312+
None => Ok(None),
313+
}
314+
}
315+
304316
/// Return vector_ids for all chunks belonging to a file.
305317
/// Useful for tombstoning before re-indexing a changed file.
306318
pub fn get_vector_ids_for_file(&self, file_id: i64) -> Result<Vec<u64>> {

0 commit comments

Comments
 (0)