Skip to content

Commit 94c3e68

Browse files
committed
feat(migrate): add heuristic PARA classification engine
1 parent a4b629b commit 94c3e68

4 files changed

Lines changed: 338 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ shimmytok = "0.7"
4141
axum = "0.8"
4242
tower-http = { version = "0.6", features = ["cors"] }
4343
tower = "0.5"
44+
uuid = { version = "1", features = ["v4"] }
4445
rand = "0.9"
4546
tokio-util = "0.7"
4647

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub mod indexer;
1111
pub mod links;
1212
pub mod llm;
1313
pub mod markdown;
14+
pub mod migrate;
1415
pub mod obsidian;
1516
pub mod placement;
1617
pub mod profile;

src/migrate.rs

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
//! Heuristic PARA classification engine for vault migration.
2+
//!
3+
//! Classifies notes into PARA categories (Project, Area, Resource, Archive)
4+
//! using priority-ordered heuristic rules. No LLM required.
5+
6+
use serde::{Deserialize, Serialize};
7+
8+
// ── Core types ─────────────────────────────────────────────────
9+
10+
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
11+
pub enum Category {
12+
Project,
13+
Area,
14+
Resource,
15+
Archive,
16+
Skip,
17+
Uncertain,
18+
}
19+
20+
#[derive(Debug, Clone, Serialize, Deserialize)]
21+
pub struct Classification {
22+
pub category: Category,
23+
pub confidence: f64,
24+
pub signal: String,
25+
pub suggested_path: Option<String>,
26+
}
27+
28+
#[derive(Debug, Clone, Serialize, Deserialize)]
29+
pub struct FileClassification {
30+
pub path: String,
31+
pub classification: Classification,
32+
}
33+
34+
#[derive(Debug, Clone, Serialize, Deserialize)]
35+
pub struct MigrationPreview {
36+
pub migration_id: String,
37+
pub files: Vec<FileClassification>,
38+
pub uncertain: Vec<FileClassification>,
39+
pub skipped: usize,
40+
}
41+
42+
#[derive(Debug, Serialize)]
43+
pub struct MigrationResult {
44+
pub migration_id: String,
45+
pub moved: usize,
46+
pub skipped: usize,
47+
pub errors: Vec<String>,
48+
}
49+
50+
#[derive(Debug, Serialize)]
51+
pub struct UndoResult {
52+
pub migration_id: String,
53+
pub restored: usize,
54+
pub errors: Vec<String>,
55+
}
56+
57+
// ── Heuristic classifier ───────────────────────────────────────
58+
59+
/// Classify a note using heuristic rules only (no LLM).
60+
/// Rules run in priority order — first match wins.
61+
///
62+
/// Parameters:
63+
/// - content: full note content
64+
/// - filename: relative path (e.g., "07-Daily/2026-03-26.md")
65+
/// - frontmatter_str: raw frontmatter YAML (without --- delimiters), or None
66+
/// - edge_count: incoming + outgoing edges from the store
67+
/// - has_recent_mentions: whether the note was mentioned in notes from the last 30 days
68+
pub fn classify_heuristic(
69+
content: &str,
70+
filename: &str,
71+
frontmatter_str: Option<&str>,
72+
edge_count: usize,
73+
has_recent_mentions: bool,
74+
) -> Classification {
75+
// Extract basename (without extension) for pattern matching
76+
let basename = std::path::Path::new(filename)
77+
.file_stem()
78+
.and_then(|s| s.to_str())
79+
.unwrap_or("");
80+
81+
// Rule 1: Daily note — basename matches YYYY-MM-DD pattern
82+
if is_daily_note(basename) {
83+
return Classification {
84+
category: Category::Skip,
85+
confidence: 1.0,
86+
signal: "daily note filename pattern".into(),
87+
suggested_path: None,
88+
};
89+
}
90+
91+
// Rule 2: Template — path contains "template" (case-insensitive)
92+
if filename.to_lowercase().contains("template") {
93+
return Classification {
94+
category: Category::Skip,
95+
confidence: 1.0,
96+
signal: "template path".into(),
97+
suggested_path: None,
98+
};
99+
}
100+
101+
// Rule 3: Canvas — filename ends with .canvas
102+
if filename.ends_with(".canvas") {
103+
return Classification {
104+
category: Category::Skip,
105+
confidence: 1.0,
106+
signal: "canvas file".into(),
107+
suggested_path: None,
108+
};
109+
}
110+
111+
let fm = frontmatter_str.unwrap_or("");
112+
113+
// Rule 4: Status active/in-progress → Project (90%)
114+
if fm.contains("status: active") || fm.contains("status: in-progress") {
115+
return Classification {
116+
category: Category::Project,
117+
confidence: 0.9,
118+
signal: "frontmatter status active/in-progress".into(),
119+
suggested_path: Some("01-Projects/".into()),
120+
};
121+
}
122+
123+
// Rule 5: Unchecked tasks → Project (80%)
124+
if content.contains("- [ ]") {
125+
return Classification {
126+
category: Category::Project,
127+
confidence: 0.8,
128+
signal: "unchecked tasks found".into(),
129+
suggested_path: Some("01-Projects/".into()),
130+
};
131+
}
132+
133+
// Rule 6: Status done/completed → Archive (85%)
134+
if fm.contains("status: done") || fm.contains("status: completed") {
135+
return Classification {
136+
category: Category::Archive,
137+
confidence: 0.85,
138+
signal: "frontmatter status done/completed".into(),
139+
suggested_path: Some("04-Archive/".into()),
140+
};
141+
}
142+
143+
// Rule 7: Person tag → Resource (90%)
144+
if fm.contains("- person") || fm.contains("- people") {
145+
return Classification {
146+
category: Category::Resource,
147+
confidence: 0.9,
148+
signal: "person/people tag in frontmatter".into(),
149+
suggested_path: Some("03-Resources/People/".into()),
150+
};
151+
}
152+
153+
// Rule 8: No edges + no recent mentions → Archive (75%)
154+
if edge_count == 0 && !has_recent_mentions {
155+
return Classification {
156+
category: Category::Archive,
157+
confidence: 0.75,
158+
signal: "no edges and no recent mentions".into(),
159+
suggested_path: Some("04-Archive/".into()),
160+
};
161+
}
162+
163+
// Rule 9: High edges + no tasks → Resource (70%)
164+
if edge_count >= 3 && !content.contains("- [ ]") {
165+
return Classification {
166+
category: Category::Resource,
167+
confidence: 0.7,
168+
signal: "high edge count with no open tasks".into(),
169+
suggested_path: Some("03-Resources/".into()),
170+
};
171+
}
172+
173+
// Rule 10: Area keywords in filename or first 200 chars of content
174+
let area_keywords = [
175+
"health", "finance", "career", "learning", "fitness", "nutrition", "budget",
176+
];
177+
let filename_lower = filename.to_lowercase();
178+
let content_prefix: String = content.chars().take(200).collect::<String>().to_lowercase();
179+
for keyword in &area_keywords {
180+
if filename_lower.contains(keyword) || content_prefix.contains(keyword) {
181+
return Classification {
182+
category: Category::Area,
183+
confidence: 0.6,
184+
signal: format!("area keyword '{keyword}' found"),
185+
suggested_path: Some("02-Areas/".into()),
186+
};
187+
}
188+
}
189+
190+
// Rule 11: Nothing matched → Uncertain
191+
Classification {
192+
category: Category::Uncertain,
193+
confidence: 0.0,
194+
signal: "no heuristic rules matched".into(),
195+
suggested_path: None,
196+
}
197+
}
198+
199+
/// Check if a basename matches the YYYY-MM-DD date pattern.
200+
fn is_daily_note(basename: &str) -> bool {
201+
let bytes = basename.as_bytes();
202+
if bytes.len() != 10 {
203+
return false;
204+
}
205+
// Check format: DDDD-DD-DD where D is digit
206+
bytes[4] == b'-'
207+
&& bytes[7] == b'-'
208+
&& bytes[0..4].iter().all(|b| b.is_ascii_digit())
209+
&& bytes[5..7].iter().all(|b| b.is_ascii_digit())
210+
&& bytes[8..10].iter().all(|b| b.is_ascii_digit())
211+
}
212+
213+
// ── Tests ──────────────────────────────────────────────────────
214+
215+
#[cfg(test)]
mod tests {
    use super::*;

    // ── Skip rules ─────────────────────────────────────────────

    #[test]
    fn test_skip_daily_note() {
        let verdict = classify_heuristic("# Daily\n", "2026-03-26.md", None, 0, true);
        assert_eq!(verdict.category, Category::Skip);
    }

    #[test]
    fn test_skip_daily_note_in_folder() {
        // Only the basename decides; the containing folder is irrelevant.
        let verdict = classify_heuristic("# Daily\n", "07-Daily/2026-03-26.md", None, 0, true);
        assert_eq!(verdict.category, Category::Skip);
    }

    #[test]
    fn test_skip_template() {
        let path = "05-Templates/Daily Note.md";
        let verdict = classify_heuristic("# Template\n", path, None, 0, false);
        assert_eq!(verdict.category, Category::Skip);
    }

    // ── Project rules ──────────────────────────────────────────

    #[test]
    fn test_classify_project_by_status() {
        let note = "---\nstatus: active\n---\n# Sprint 6\n";
        let verdict = classify_heuristic(note, "sprint-6.md", Some("status: active"), 5, true);
        assert_eq!(verdict.category, Category::Project);
        assert!(verdict.confidence >= 0.9);
    }

    #[test]
    fn test_classify_project_by_tasks() {
        let note = "# Todo\n- [ ] Fix bug\n- [x] Done\n";
        let verdict = classify_heuristic(note, "todo.md", None, 2, true);
        assert_eq!(verdict.category, Category::Project);
        assert!(verdict.confidence >= 0.8);
    }

    // ── Archive rules ──────────────────────────────────────────

    #[test]
    fn test_classify_archive_by_status() {
        let note = "---\nstatus: done\n---\n# Old\n";
        let verdict = classify_heuristic(note, "old.md", Some("status: done"), 0, false);
        assert_eq!(verdict.category, Category::Archive);
    }

    #[test]
    fn test_classify_archive_no_edges() {
        let verdict = classify_heuristic("# Random\nSome content\n", "random.md", None, 0, false);
        assert_eq!(verdict.category, Category::Archive);
    }

    // ── Resource / Area rules ──────────────────────────────────

    #[test]
    fn test_classify_resource_person() {
        let note = "---\ntags:\n - person\n---\n# John\n";
        let frontmatter = "tags:\n - person";
        let verdict = classify_heuristic(note, "john.md", Some(frontmatter), 3, true);
        assert_eq!(verdict.category, Category::Resource);
    }

    #[test]
    fn test_classify_area_keywords() {
        let note = "# Health\n\nTreadmill training\n";
        let verdict = classify_heuristic(note, "health.md", None, 2, true);
        assert_eq!(verdict.category, Category::Area);
    }

    // ── Fallback ───────────────────────────────────────────────

    #[test]
    fn test_uncertain_when_ambiguous() {
        // Edges and recent mentions are present, but there are no tasks, no
        // status, no person tag and no area keyword. Two edges stay below the
        // three-edge threshold of the Resource rule.
        let note = "# Meeting notes\nDiscussed roadmap\n";
        let verdict = classify_heuristic(note, "meeting.md", None, 2, true);
        assert_eq!(verdict.category, Category::Uncertain);
    }
}

0 commit comments

Comments
 (0)