Skip to content

Commit 727245f

Browse files
devwhodevs authored and claude committed
fix: UTF-8 char boundary panics in smart chunker + schema migration order
Two bugs found during real vault testing:

1. Smart chunker panicked on multi-byte UTF-8 chars (em dash, etc.) when byte offsets from break-point scoring landed inside multi-byte sequences. Fixed by snapping all byte offsets to valid char boundaries before slicing.

2. Schema migration failed on existing v0.1 databases: the SCHEMA constant tried to CREATE INDEX on the docid column before the migration added it. Moved index creation into the migration path so it runs after the column exists.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b738b6f commit 727245f

2 files changed

Lines changed: 23 additions & 4 deletions

File tree

src/chunker.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,16 @@ fn approx_tokens(text: &str) -> usize {
163163
text.len().div_ceil(4)
164164
}
165165

166+
/// Snap a byte offset to the nearest valid UTF-8 char boundary, scanning
/// forward. Offsets past the end of `s` clamp to `s.len()`.
///
/// Since `s.len()` is always a valid boundary, the forward scan is
/// guaranteed to terminate at or before the end of the string.
fn snap_to_char_boundary(s: &str, offset: usize) -> usize {
    let start = offset.min(s.len());
    (start..=s.len())
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(s.len())
}
175+
166176
/// Extract the first heading line from text (any `#` level).
167177
fn extract_heading(text: &str) -> Option<String> {
168178
for line in text.lines() {
@@ -205,6 +215,10 @@ pub fn smart_chunk(content: &str, target_tokens: usize, overlap_pct: usize) -> V
205215
let mut start_offset = 0;
206216

207217
while start_offset < content.len() {
218+
start_offset = snap_to_char_boundary(content, start_offset);
219+
if start_offset >= content.len() {
220+
break;
221+
}
208222
let remaining = &content[start_offset..];
209223
if remaining.trim().is_empty() {
210224
break;
@@ -252,7 +266,10 @@ pub fn smart_chunk(content: &str, target_tokens: usize, overlap_pct: usize) -> V
252266
Some(bp) => bp.byte_offset,
253267
None => {
254268
// No good break point found; cut at target
255-
let cut = (start_offset + target_chars).min(content.len());
269+
let cut = snap_to_char_boundary(
270+
content,
271+
(start_offset + target_chars).min(content.len()),
272+
);
256273
// Try to find a newline near the cut
257274
let fallback = if let Some(nl) = content[start_offset..cut.min(content.len())]
258275
.rfind('\n')
@@ -267,6 +284,7 @@ pub fn smart_chunk(content: &str, target_tokens: usize, overlap_pct: usize) -> V
267284
}
268285
};
269286

287+
let cut_offset = snap_to_char_boundary(content, cut_offset);
270288
let chunk_text = content[start_offset..cut_offset].trim().to_string();
271289
if !chunk_text.is_empty() {
272290
let heading = extract_heading(&chunk_text);

src/store.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,6 @@ CREATE TABLE IF NOT EXISTS files (
6363
docid TEXT
6464
);
6565
66-
CREATE INDEX IF NOT EXISTS idx_files_docid ON files(docid);
67-
6866
CREATE TABLE IF NOT EXISTS chunks (
6967
id INTEGER PRIMARY KEY,
7068
file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
@@ -130,8 +128,11 @@ impl Store {
130128
};
131129
if !has_docid {
132130
self.conn
133-
.execute_batch("ALTER TABLE files ADD COLUMN docid TEXT; CREATE INDEX IF NOT EXISTS idx_files_docid ON files(docid);")?;
131+
.execute_batch("ALTER TABLE files ADD COLUMN docid TEXT;")?;
134132
}
133+
// Always ensure the index exists (safe for both fresh and migrated DBs).
134+
self.conn
135+
.execute_batch("CREATE INDEX IF NOT EXISTS idx_files_docid ON files(docid);")?;
135136
Ok(())
136137
}
137138

0 commit comments

Comments
 (0)