
Commit 8e0764c

devwhodevs and claude committed
chore: LICENSE, README, CI/CD workflows, gitignore
Add MIT license, project README with install/usage docs, GitHub Actions CI (fmt/clippy/test on macOS+Linux) and release workflows (cross-platform binary builds). Fix clippy warnings and apply cargo fmt. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 369926c commit 8e0764c

13 files changed: 285 additions & 107 deletions


.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  check:
    strategy:
      matrix:
        os: [macos-latest, ubuntu-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy
      - run: cargo fmt --check
      - run: cargo clippy -- -D warnings
      - run: cargo test --lib

.github/workflows/release.yml

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
name: Release

on:
  push:
    tags: ["v*"]

jobs:
  build:
    strategy:
      matrix:
        include:
          - target: aarch64-apple-darwin
            os: macos-14
            archive: engraph-macos-arm64.tar.gz
          - target: x86_64-apple-darwin
            os: macos-13
            archive: engraph-macos-x86_64.tar.gz
          - target: x86_64-unknown-linux-gnu
            os: ubuntu-latest
            archive: engraph-linux-x86_64.tar.gz
    runs-on: ${{ matrix.os }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          targets: ${{ matrix.target }}
      - run: cargo build --release --target ${{ matrix.target }}
      - name: Archive binary
        run: |
          cd target/${{ matrix.target }}/release
          tar czf ../../../${{ matrix.archive }} engraph
          cd ../../..
      - uses: softprops/action-gh-release@v2
        with:
          files: ${{ matrix.archive }}

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -1 +1,4 @@
 /target
+*.swp
+*.swo
+.DS_Store

LICENSE

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 engraph contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# engraph

Local semantic search for Obsidian vaults.

## Install

**Homebrew (macOS):**

```bash
brew install devwhodevs/tap/engraph
```

**Cargo:**

```bash
cargo install engraph
```

**Binary download:**

Pre-built binaries for macOS (arm64/x86_64) and Linux (x86_64) are available on the [Releases](https://github.com/devwhodevs/engraph/releases) page.

## Quick start

```bash
engraph index ~/vault
engraph search "query"
```

## Commands

| Command  | Description                        | Flags                       |
|----------|------------------------------------|-----------------------------|
| `index`  | Index a vault for semantic search  | `[path]`, `--rebuild`       |
| `search` | Search the indexed vault           | `<query>`, `-n/--top-n <N>` |
| `status` | Show index status and statistics   |                             |
| `clear`  | Clear cached data                  | `--all`                     |

## Configuration

engraph reads `~/.config/engraph/config.toml`:

```toml
vault_path = "~/Documents/vault"
top_n = 5
exclude = [".obsidian/*", ".trash/*"]
batch_size = 64
```

| Key          | Description                            | Default                            |
|--------------|----------------------------------------|------------------------------------|
| `vault_path` | Path to Obsidian vault                 | None (must specify via CLI/config) |
| `top_n`      | Number of search results to return     | `5`                                |
| `exclude`    | Glob patterns to exclude from indexing | `[".obsidian/*", ".trash/*"]`      |
| `batch_size` | Files per embedding batch              | `64`                               |
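For illustration, here is roughly how those keys could map onto a serde-deserialized struct. The field names come from the table above; the struct itself, its default helpers, and the use of the `toml` crate are assumptions for this sketch, not necessarily the types in the repository:

```rust
use serde::Deserialize;

/// Illustrative config shape matching the keys documented above.
#[derive(Debug, Deserialize)]
struct Config {
    /// No default: must be supplied via CLI argument or config file.
    vault_path: Option<String>,
    #[serde(default = "default_top_n")]
    top_n: usize,
    #[serde(default = "default_exclude")]
    exclude: Vec<String>,
    #[serde(default = "default_batch_size")]
    batch_size: usize,
}

fn default_top_n() -> usize {
    5
}

fn default_exclude() -> Vec<String> {
    vec![".obsidian/*".into(), ".trash/*".into()]
}

fn default_batch_size() -> usize {
    64
}

/// Parse a config.toml string into the struct above.
fn load_config(text: &str) -> Result<Config, toml::de::Error> {
    toml::from_str(text)
}
```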
## How it works

engraph splits your vault's markdown files into heading-based chunks, generates embeddings locally with an ONNX Runtime model (all-MiniLM-L6-v2), and stores them in an HNSW index for fast approximate nearest-neighbor search. Everything runs on your machine -- no API keys, no network calls after the one-time model download.

The indexing pipeline (step 2 is sketched below):

1. Walk the vault, respecting `.gitignore` and exclude patterns
2. Split each markdown file into chunks at heading boundaries
3. Sub-split oversized chunks to stay within the model's token limit
4. Embed chunks in batches via ONNX Runtime
5. Insert embeddings into an HNSW graph stored alongside a SQLite metadata database

Subsequent runs are incremental -- only new or modified files are re-processed.
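As a rough illustration of the heading-based split (step 2), here is a self-contained sketch that walks markdown lines and starts a new chunk at each heading. The `Chunk` shape and the line-by-line approach are simplified assumptions, not the actual implementation in `src/chunker.rs` (which, for instance, also handles frontmatter and tags):

```rust
/// Simplified chunk: a heading (if any) plus the text under it.
#[derive(Debug)]
struct Chunk {
    heading: Option<String>,
    text: String,
}

/// Split markdown into chunks at heading boundaries (`#`, `##`, ...).
/// For brevity this sketch ignores headings inside fenced code blocks.
fn split_by_headings(markdown: &str) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut current = Chunk { heading: None, text: String::new() };

    for line in markdown.lines() {
        if line.trim_start().starts_with('#') {
            // Close the previous chunk if it carries any content.
            if current.heading.is_some() || !current.text.trim().is_empty() {
                chunks.push(current);
            }
            current = Chunk { heading: Some(line.trim().to_string()), text: String::new() };
        } else {
            current.text.push_str(line);
            current.text.push('\n');
        }
    }
    if current.heading.is_some() || !current.text.trim().is_empty() {
        chunks.push(current);
    }
    chunks
}
```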
## Contributing

Contributions are welcome. Please open an issue to discuss larger changes before submitting a PR.

```bash
cargo fmt
cargo clippy -- -D warnings
cargo test --lib
```

## License

MIT

src/chunker.rs

Lines changed: 21 additions & 8 deletions
@@ -211,7 +211,7 @@ fn parse_frontmatter(content: &str) -> (Vec<String>, &str) {
 
     // Find the closing ---
     let after_first = &trimmed[3..];
-    let after_first = after_first.trim_start_matches(|c: char| c == '-'); // handle "----"
+    let after_first = after_first.trim_start_matches('-'); // handle "----"
     let after_first = after_first.strip_prefix('\n').unwrap_or(after_first);
 
     if let Some(end_pos) = after_first.find("\n---") {
@@ -238,9 +238,7 @@ fn parse_tags_from_yaml(yaml: &str) -> Vec<String> {
         let after_colon = trimmed.strip_prefix("tags:").unwrap().trim();
         // Inline list: tags: [a, b]
         if after_colon.starts_with('[') {
-            let inner = after_colon
-                .trim_start_matches('[')
-                .trim_end_matches(']');
+            let inner = after_colon.trim_start_matches('[').trim_end_matches(']');
             return inner
                 .split(',')
                 .map(|s| s.trim().to_string())
@@ -341,11 +339,19 @@ mod tests {
     fn test_long_chunk_split() {
         // Generate ~600 words of text with sentence boundaries
         let sentences: Vec<String> = (0..60)
-            .map(|i| format!("This is sentence number {} with several words to pad it out.", i))
+            .map(|i| {
+                format!(
+                    "This is sentence number {} with several words to pad it out.",
+                    i
+                )
+            })
             .collect();
         let long_text = sentences.join(" ");
         let word_count = long_text.split_whitespace().count();
-        assert!(word_count > 512, "Test text must exceed 512 tokens (words); got {word_count}");
+        assert!(
+            word_count > 512,
+            "Test text must exceed 512 tokens (words); got {word_count}"
+        );
 
         let chunk = Chunk {
             heading: Some("## Long Section".to_string()),
@@ -356,11 +362,18 @@
         let token_fn = |s: &str| s.split_whitespace().count();
         let result = split_oversized_chunks(vec![chunk], &token_fn, 512, 50);
 
-        assert!(result.len() >= 2, "Expected at least 2 sub-chunks, got {}", result.len());
+        assert!(
+            result.len() >= 2,
+            "Expected at least 2 sub-chunks, got {}",
+            result.len()
+        );
         // First chunk keeps original heading
        assert_eq!(result[0].heading.as_deref(), Some("## Long Section"));
         // Subsequent chunks get (cont.)
-        assert_eq!(result[1].heading.as_deref(), Some("## Long Section (cont.)"));
+        assert_eq!(
+            result[1].heading.as_deref(),
+            Some("## Long Section (cont.)")
+        );
         // All sub-chunks should be within token limit
         for c in &result {
             let tokens = token_fn(&c.text);
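The tests above pin down the sub-splitting behavior: an over-limit chunk is broken at sentence boundaries, the first piece keeps its heading, and later pieces get a "(cont.)" suffix. A rough, self-contained sketch of that idea follows; the real `split_oversized_chunks` also takes an overlap parameter and operates on whole `Chunk` values, so everything here is illustrative only:

```rust
/// Illustrative only: greedily pack sentences into sub-chunks that stay
/// under `max_tokens`, counting tokens with the caller-supplied `token_fn`.
fn split_oversized<F: Fn(&str) -> usize>(
    heading: Option<&str>,
    text: &str,
    token_fn: F,
    max_tokens: usize,
) -> Vec<(Option<String>, String)> {
    let mut pieces: Vec<String> = Vec::new();
    let mut current = String::new();

    // Naive sentence split on ". " -- the real chunker is more careful.
    for sentence in text.split_inclusive(". ") {
        let candidate_len = token_fn(current.as_str()) + token_fn(sentence);
        if !current.is_empty() && candidate_len > max_tokens {
            pieces.push(std::mem::take(&mut current));
        }
        current.push_str(sentence);
    }
    if !current.is_empty() {
        pieces.push(current);
    }

    // First piece keeps the heading; later pieces get a "(cont.)" marker.
    pieces
        .into_iter()
        .enumerate()
        .map(|(i, body)| {
            let h = heading.map(|h| {
                if i == 0 { h.to_string() } else { format!("{h} (cont.)") }
            });
            (h, body)
        })
        .collect()
}
```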

src/embedder.rs

Lines changed: 15 additions & 10 deletions
@@ -1,7 +1,7 @@
 use std::io::Read;
 use std::path::Path;
 
-use anyhow::{bail, Context, Result};
+use anyhow::{Context, Result, bail};
 use indicatif::{ProgressBar, ProgressStyle};
 use ndarray::Array2;
 use ort::session::Session;
@@ -27,6 +27,7 @@ pub struct Embedder {
 impl Embedder {
     /// Create a new Embedder, downloading the model and tokenizer into
     /// `models_dir` if they are not already present.
+    #[allow(clippy::const_is_empty)] // MODEL_SHA256 is empty until a known hash is pinned.
     pub fn new(models_dir: &Path) -> Result<Self> {
         std::fs::create_dir_all(models_dir)
             .with_context(|| format!("creating models dir {}", models_dir.display()))?;
@@ -36,7 +37,7 @@ impl Embedder {
 
         // Download model if missing.
         if !model_path.exists() {
-            let expected_sha = if MODEL_SHA256.is_empty() {
+            let expected_sha: Option<&str> = if MODEL_SHA256.is_empty() {
                 None
             } else {
                 Some(MODEL_SHA256)
@@ -79,7 +80,11 @@ impl Embedder {
             .map_err(|e| anyhow::anyhow!("tokenization failed: {e}"))?;
 
         let batch_size = encodings.len();
-        let max_len = encodings.iter().map(|e| e.get_ids().len()).max().unwrap_or(0);
+        let max_len = encodings
+            .iter()
+            .map(|e| e.get_ids().len())
+            .max()
+            .unwrap_or(0);
 
         // Build padded input arrays.
         let mut input_ids_vec = vec![0i64; batch_size * max_len];
@@ -99,12 +104,9 @@ impl Embedder {
             }
         }
 
-        let input_ids =
-            Array2::from_shape_vec((batch_size, max_len), input_ids_vec)?;
-        let attention_mask =
-            Array2::from_shape_vec((batch_size, max_len), attention_mask_vec)?;
-        let token_type_ids =
-            Array2::from_shape_vec((batch_size, max_len), token_type_ids_vec)?;
+        let input_ids = Array2::from_shape_vec((batch_size, max_len), input_ids_vec)?;
+        let attention_mask = Array2::from_shape_vec((batch_size, max_len), attention_mask_vec)?;
+        let token_type_ids = Array2::from_shape_vec((batch_size, max_len), token_type_ids_vec)?;
 
         let input_ids_tensor = Tensor::from_array(input_ids)?;
         let attention_mask_tensor = Tensor::from_array(attention_mask.clone())?;
@@ -200,6 +202,7 @@ fn verify_sha256(path: &Path, expected: &str) -> Result<()> {
 }
 
 /// Compute SHA-256 hex digest of a byte slice.
+#[cfg(test)]
 fn sha256_bytes(data: &[u8]) -> String {
     let mut hasher = Sha256::new();
     hasher.update(data);
@@ -212,7 +215,9 @@ fn download_file(url: &str, dest: &Path, expected_sha256: Option<&str>) -> Resul
 fn try_download(url: &str, dest: &Path, expected_sha256: Option<&str>) -> Result<()> {
     info!("downloading {} -> {}", url, dest.display());
 
-    let resp = ureq::get(url).call().with_context(|| format!("HTTP GET {url}"))?;
+    let resp = ureq::get(url)
+        .call()
+        .with_context(|| format!("HTTP GET {url}"))?;
 
     let total_size: u64 = resp
         .header("Content-Length")

src/hnsw.rs

Lines changed: 6 additions & 11 deletions
@@ -73,12 +73,7 @@
     /// Returns `(vector_id, score)` pairs sorted by ascending distance,
     /// excluding any IDs in `tombstones`. Requests `k * 2` results from
     /// the underlying index for tombstone headroom.
-    pub fn search(
-        &self,
-        query: &[f32],
-        k: usize,
-        tombstones: &HashSet<u64>,
-    ) -> Vec<(u64, f32)> {
+    pub fn search(&self, query: &[f32], k: usize, tombstones: &HashSet<u64>) -> Vec<(u64, f32)> {
         if self.inner.get_nb_point() == 0 {
             return Vec::new();
         }
@@ -158,10 +153,7 @@
 
         let results = index.search(&vectors[0], 5, &tombstones);
         for (id, _score) in &results {
-            assert_ne!(
-                *id, ids[0],
-                "tombstoned ID should not appear in results"
-            );
+            assert_ne!(*id, ids[0], "tombstoned ID should not appear in results");
         }
     }
 
@@ -180,7 +172,10 @@
         // Load and search.
         let index = HnswIndex::load(tmpdir.path()).unwrap();
         let results = index.search(&vectors[0], 3, &HashSet::new());
-        assert!(!results.is_empty(), "search after reload returned no results");
+        assert!(
+            !results.is_empty(),
+            "search after reload returned no results"
+        );
         assert_eq!(
             results[0].0, 0,
             "expected vector 0 to be the top result after reload"
