
Commit 8e0764c

devwhodevs and claude committed
chore: LICENSE, README, CI/CD workflows, gitignore
Add MIT license, project README with install/usage docs, GitHub Actions CI (fmt/clippy/test on macOS+Linux) and release workflows (cross-platform binary builds). Fix clippy warnings and apply cargo fmt. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 369926c commit 8e0764c

13 files changed: 285 additions & 107 deletions


.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  check:
    strategy:
      matrix:
        os: [macos-latest, ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy
      - run: cargo fmt --check
      - run: cargo clippy -- -D warnings
      - run: cargo test --lib

.github/workflows/release.yml

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
name: Release

on:
  push:
    tags: ["v*"]

jobs:
  build:
    strategy:
      matrix:
        include:
          - target: aarch64-apple-darwin
            os: macos-14
            archive: engraph-macos-arm64.tar.gz
          - target: x86_64-apple-darwin
            os: macos-13
            archive: engraph-macos-x86_64.tar.gz
          - target: x86_64-unknown-linux-gnu
            os: ubuntu-latest
            archive: engraph-linux-x86_64.tar.gz
    runs-on: ${{ matrix.os }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          targets: ${{ matrix.target }}
      - run: cargo build --release --target ${{ matrix.target }}
      - name: Archive binary
        run: |
          cd target/${{ matrix.target }}/release
          tar czf ../../../${{ matrix.archive }} engraph
          cd ../../..
      - uses: softprops/action-gh-release@v2
        with:
          files: ${{ matrix.archive }}

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -1 +1,4 @@
 /target
+*.swp
+*.swo
+.DS_Store

LICENSE

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 engraph contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# engraph

Local semantic search for Obsidian vaults.

## Install

**Homebrew (macOS):**

```bash
brew install devwhodevs/tap/engraph
```

**Cargo:**

```bash
cargo install engraph
```

**Binary download:**

Pre-built binaries for macOS (arm64/x86_64) and Linux (x86_64) are available on the [Releases](https://github.com/devwhodevs/engraph/releases) page.

## Quick start

```bash
engraph index ~/vault
engraph search "query"
```

## Commands

| Command  | Description                        | Flags                       |
|----------|------------------------------------|-----------------------------|
| `index`  | Index a vault for semantic search  | `[path]`, `--rebuild`       |
| `search` | Search the indexed vault           | `<query>`, `-n/--top-n <N>` |
| `status` | Show index status and statistics   |                             |
| `clear`  | Clear cached data                  | `--all`                     |

## Configuration

engraph reads `~/.config/engraph/config.toml`:

```toml
vault_path = "~/Documents/vault"
top_n = 5
exclude = [".obsidian/*", ".trash/*"]
batch_size = 64
```

| Key          | Description                            | Default                            |
|--------------|----------------------------------------|------------------------------------|
| `vault_path` | Path to Obsidian vault                 | None (must specify via CLI/config) |
| `top_n`      | Number of search results to return     | `5`                                |
| `exclude`    | Glob patterns to exclude from indexing | `[".obsidian/*", ".trash/*"]`      |
| `batch_size` | Files per embedding batch              | `64`                               |
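For illustration, here is roughly how those keys could map onto a serde-deserialized struct. The field names come from the table above; the struct itself, its default helpers, and the use of the `toml` crate are assumptions for this sketch, not necessarily the types in the repository:

```rust
use serde::Deserialize;

/// Illustrative config shape matching the keys documented above.
#[derive(Debug, Deserialize)]
struct Config {
    /// No default: must be supplied via CLI argument or config file.
    vault_path: Option<String>,
    #[serde(default = "default_top_n")]
    top_n: usize,
    #[serde(default = "default_exclude")]
    exclude: Vec<String>,
    #[serde(default = "default_batch_size")]
    batch_size: usize,
}

fn default_top_n() -> usize {
    5
}

fn default_exclude() -> Vec<String> {
    vec![".obsidian/*".into(), ".trash/*".into()]
}

fn default_batch_size() -> usize {
    64
}

/// Parse a config.toml string into the struct above.
fn load_config(text: &str) -> Result<Config, toml::de::Error> {
    toml::from_str(text)
}
```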
## How it works

engraph splits your vault's markdown files into heading-based chunks, generates embeddings locally with an ONNX Runtime model (all-MiniLM-L6-v2), and stores them in an HNSW index for fast approximate nearest-neighbor search. Everything runs on your machine -- no API keys, no network calls after the one-time model download.

The indexing pipeline (step 2 is sketched below):

1. Walk the vault, respecting `.gitignore` and exclude patterns
2. Split each markdown file into chunks at heading boundaries
3. Sub-split oversized chunks to stay within the model's token limit
4. Embed chunks in batches via ONNX Runtime
5. Insert embeddings into an HNSW graph stored alongside a SQLite metadata database

Subsequent runs are incremental -- only new or modified files are re-processed.
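As a rough illustration of the heading-based split (step 2), here is a self-contained sketch that walks markdown lines and starts a new chunk at each heading. The `Chunk` shape and the line-by-line approach are simplified assumptions, not the actual implementation in `src/chunker.rs` (which, for instance, also handles frontmatter and tags):

```rust
/// Simplified chunk: a heading (if any) plus the text under it.
#[derive(Debug)]
struct Chunk {
    heading: Option<String>,
    text: String,
}

/// Split markdown into chunks at heading boundaries (`#`, `##`, ...).
/// For brevity this sketch ignores headings inside fenced code blocks.
fn split_by_headings(markdown: &str) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut current = Chunk { heading: None, text: String::new() };

    for line in markdown.lines() {
        if line.trim_start().starts_with('#') {
            // Close the previous chunk if it carries any content.
            if current.heading.is_some() || !current.text.trim().is_empty() {
                chunks.push(current);
            }
            current = Chunk { heading: Some(line.trim().to_string()), text: String::new() };
        } else {
            current.text.push_str(line);
            current.text.push('\n');
        }
    }
    if current.heading.is_some() || !current.text.trim().is_empty() {
        chunks.push(current);
    }
    chunks
}
```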
## Contributing

Contributions are welcome. Please open an issue to discuss larger changes before submitting a PR.

```bash
cargo fmt
cargo clippy -- -D warnings
cargo test --lib
```

## License

MIT

src/chunker.rs

Lines changed: 21 additions & 8 deletions
@@ -211,7 +211,7 @@ fn parse_frontmatter(content: &str) -> (Vec<String>, &str) {
 
     // Find the closing ---
     let after_first = &trimmed[3..];
-    let after_first = after_first.trim_start_matches(|c: char| c == '-'); // handle "----"
+    let after_first = after_first.trim_start_matches('-'); // handle "----"
     let after_first = after_first.strip_prefix('\n').unwrap_or(after_first);
 
     if let Some(end_pos) = after_first.find("\n---") {
@@ -238,9 +238,7 @@ fn parse_tags_from_yaml(yaml: &str) -> Vec<String> {
         let after_colon = trimmed.strip_prefix("tags:").unwrap().trim();
         // Inline list: tags: [a, b]
         if after_colon.starts_with('[') {
-            let inner = after_colon
-                .trim_start_matches('[')
-                .trim_end_matches(']');
+            let inner = after_colon.trim_start_matches('[').trim_end_matches(']');
             return inner
                 .split(',')
                 .map(|s| s.trim().to_string())
@@ -341,11 +339,19 @@ mod tests {
     fn test_long_chunk_split() {
         // Generate ~600 words of text with sentence boundaries
         let sentences: Vec<String> = (0..60)
-            .map(|i| format!("This is sentence number {} with several words to pad it out.", i))
+            .map(|i| {
+                format!(
+                    "This is sentence number {} with several words to pad it out.",
+                    i
+                )
+            })
             .collect();
         let long_text = sentences.join(" ");
         let word_count = long_text.split_whitespace().count();
-        assert!(word_count > 512, "Test text must exceed 512 tokens (words); got {word_count}");
+        assert!(
+            word_count > 512,
+            "Test text must exceed 512 tokens (words); got {word_count}"
+        );
 
         let chunk = Chunk {
             heading: Some("## Long Section".to_string()),
@@ -356,11 +362,18 @@
         let token_fn = |s: &str| s.split_whitespace().count();
         let result = split_oversized_chunks(vec![chunk], &token_fn, 512, 50);
 
-        assert!(result.len() >= 2, "Expected at least 2 sub-chunks, got {}", result.len());
+        assert!(
+            result.len() >= 2,
+            "Expected at least 2 sub-chunks, got {}",
+            result.len()
+        );
         // First chunk keeps original heading
        assert_eq!(result[0].heading.as_deref(), Some("## Long Section"));
         // Subsequent chunks get (cont.)
-        assert_eq!(result[1].heading.as_deref(), Some("## Long Section (cont.)"));
+        assert_eq!(
+            result[1].heading.as_deref(),
+            Some("## Long Section (cont.)")
+        );
         // All sub-chunks should be within token limit
         for c in &result {
             let tokens = token_fn(&c.text);
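The tests above pin down the sub-splitting behavior: an over-limit chunk is broken at sentence boundaries, the first piece keeps its heading, and later pieces get a "(cont.)" suffix. A rough, self-contained sketch of that idea follows; the real `split_oversized_chunks` also takes an overlap parameter and operates on whole `Chunk` values, so everything here is illustrative only:

```rust
/// Illustrative only: greedily pack sentences into sub-chunks that stay
/// under `max_tokens`, counting tokens with the caller-supplied `token_fn`.
fn split_oversized<F: Fn(&str) -> usize>(
    heading: Option<&str>,
    text: &str,
    token_fn: F,
    max_tokens: usize,
) -> Vec<(Option<String>, String)> {
    let mut pieces: Vec<String> = Vec::new();
    let mut current = String::new();

    // Naive sentence split on ". " -- the real chunker is more careful.
    for sentence in text.split_inclusive(". ") {
        let candidate_len = token_fn(current.as_str()) + token_fn(sentence);
        if !current.is_empty() && candidate_len > max_tokens {
            pieces.push(std::mem::take(&mut current));
        }
        current.push_str(sentence);
    }
    if !current.is_empty() {
        pieces.push(current);
    }

    // First piece keeps the heading; later pieces get a "(cont.)" marker.
    pieces
        .into_iter()
        .enumerate()
        .map(|(i, body)| {
            let h = heading.map(|h| {
                if i == 0 { h.to_string() } else { format!("{h} (cont.)") }
            });
            (h, body)
        })
        .collect()
}
```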

src/embedder.rs

Lines changed: 15 additions & 10 deletions
@@ -1,7 +1,7 @@
 use std::io::Read;
 use std::path::Path;
 
-use anyhow::{bail, Context, Result};
+use anyhow::{Context, Result, bail};
 use indicatif::{ProgressBar, ProgressStyle};
 use ndarray::Array2;
 use ort::session::Session;
@@ -27,6 +27,7 @@ pub struct Embedder {
 impl Embedder {
     /// Create a new Embedder, downloading the model and tokenizer into
     /// `models_dir` if they are not already present.
+    #[allow(clippy::const_is_empty)] // MODEL_SHA256 is empty until a known hash is pinned.
     pub fn new(models_dir: &Path) -> Result<Self> {
         std::fs::create_dir_all(models_dir)
             .with_context(|| format!("creating models dir {}", models_dir.display()))?;
@@ -36,7 +37,7 @@ impl Embedder {
 
         // Download model if missing.
         if !model_path.exists() {
-            let expected_sha = if MODEL_SHA256.is_empty() {
+            let expected_sha: Option<&str> = if MODEL_SHA256.is_empty() {
                 None
             } else {
                 Some(MODEL_SHA256)
@@ -79,7 +80,11 @@ impl Embedder {
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
 
         let batch_size = encodings.len();
-        let max_len = encodings.iter().map(|e| e.get_ids().len()).max().unwrap_or(0);
+        let max_len = encodings
+            .iter()
+            .map(|e| e.get_ids().len())
+            .max()
+            .unwrap_or(0);
 
         // Build padded input arrays.
         let mut input_ids_vec = vec![0i64; batch_size * max_len];
@@ -99,12 +104,9 @@ impl Embedder {
             }
         }
 
-        let input_ids =
-            Array2::from_shape_vec((batch_size, max_len), input_ids_vec)?;
-        let attention_mask =
-            Array2::from_shape_vec((batch_size, max_len), attention_mask_vec)?;
-        let token_type_ids =
-            Array2::from_shape_vec((batch_size, max_len), token_type_ids_vec)?;
+        let input_ids = Array2::from_shape_vec((batch_size, max_len), input_ids_vec)?;
+        let attention_mask = Array2::from_shape_vec((batch_size, max_len), attention_mask_vec)?;
+        let token_type_ids = Array2::from_shape_vec((batch_size, max_len), token_type_ids_vec)?;
 
         let input_ids_tensor = Tensor::from_array(input_ids)?;
         let attention_mask_tensor = Tensor::from_array(attention_mask.clone())?;
@@ -200,6 +202,7 @@ fn verify_sha256(path: &Path, expected: &str) -> Result<()> {
 }
 
 /// Compute SHA-256 hex digest of a byte slice.
+#[cfg(test)]
 fn sha256_bytes(data: &[u8]) -> String {
     let mut hasher = Sha256::new();
     hasher.update(data);
@@ -212,7 +215,9 @@ fn download_file(url: &str, dest: &Path, expected_sha256: Option<&str>) -> Resul
 fn try_download(url: &str, dest: &Path, expected_sha256: Option<&str>) -> Result<()> {
     info!("downloading {} -> {}", url, dest.display());
 
-    let resp = ureq::get(url).call().with_context(|| format!("HTTP GET {url}"))?;
+    let resp = ureq::get(url)
+        .call()
+        .with_context(|| format!("HTTP GET {url}"))?;
 
     let total_size: u64 = resp
         .header("Content-Length")

src/hnsw.rs

Lines changed: 6 additions & 11 deletions
@@ -73,12 +73,7 @@
     /// Returns `(vector_id, score)` pairs sorted by ascending distance,
     /// excluding any IDs in `tombstones`. Requests `k * 2` results from
     /// the underlying index for tombstone headroom.
-    pub fn search(
-        &self,
-        query: &[f32],
-        k: usize,
-        tombstones: &HashSet<u64>,
-    ) -> Vec<(u64, f32)> {
+    pub fn search(&self, query: &[f32], k: usize, tombstones: &HashSet<u64>) -> Vec<(u64, f32)> {
         if self.inner.get_nb_point() == 0 {
             return Vec::new();
         }
@@ -158,10 +153,7 @@
 
         let results = index.search(&vectors[0], 5, &tombstones);
         for (id, _score) in &results {
-            assert_ne!(
-                *id, ids[0],
-                "tombstoned ID should not appear in results"
-            );
+            assert_ne!(*id, ids[0], "tombstoned ID should not appear in results");
         }
     }
 
@@ -180,7 +172,10 @@
         // Load and search.
         let index = HnswIndex::load(tmpdir.path()).unwrap();
         let results = index.search(&vectors[0], 3, &HashSet::new());
-        assert!(!results.is_empty(), "search after reload returned no results");
+        assert!(
+            !results.is_empty(),
+            "search after reload returned no results"
+        );
         assert_eq!(
             results[0].0, 0,
             "expected vector 0 to be the top result after reload"
