Skip to content

Commit 4a4c1ea

Browse files
committed
feat(markdown): add section parser and robust frontmatter splitter
1 parent cb8f324 commit 4a4c1ea

2 files changed

Lines changed: 174 additions & 0 deletions

File tree

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pub mod graph;
88
pub mod indexer;
99
pub mod links;
1010
pub mod llm;
11+
pub mod markdown;
1112
pub mod placement;
1213
pub mod profile;
1314
pub mod search;

src/markdown.rs

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/// A Markdown ATX heading (`#` through `######`) found by [`parse_headings`].
#[derive(Debug, Clone)]
pub struct HeadingInfo {
    /// Zero-based index of the heading's line within `content.lines()`.
    pub line: usize,
    /// Number of leading `#` characters, always in `1..=6`.
    pub level: u8,
    /// Heading text without the marker, surrounding whitespace, or an
    /// optional closing run of `#` characters.
    pub text: String,
}

/// Collects every ATX heading in `content`, in document order.
///
/// Lines inside fenced code blocks (delimited by three or more backticks or
/// tildes) are skipped. A fence is closed only by a line made of the same
/// marker character, at least as long as the opening run, with nothing after
/// it; an unterminated fence runs to the end of the input.
///
/// Leading whitespace before the marker is ignored. A line counts as a
/// heading when it starts with one to six `#` characters followed by a
/// space, a tab, or the end of the line, so inline tags such as `#tag` and
/// runs of seven or more `#` are not headings.
pub fn parse_headings(content: &str) -> Vec<HeadingInfo> {
    let mut headings = Vec::new();
    // `Some((marker, run_length))` while inside a fenced code block.
    let mut open_fence: Option<(char, usize)> = None;

    for (i, line) in content.lines().enumerate() {
        let trimmed = line.trim();

        if let Some((marker, len)) = fence_run(trimmed) {
            match open_fence {
                Some((open_marker, open_len)) => {
                    // `trimmed` has no trailing whitespace, so an equal
                    // length means the line is nothing but the fence run.
                    if marker == open_marker && len >= open_len && trimmed.len() == len {
                        open_fence = None;
                    }
                    continue;
                }
                None => {
                    // The marker is ASCII, so its char count is a byte offset.
                    let info = &trimmed[len..];
                    // A backtick fence may not carry backticks in its info
                    // string; such a line is inline code, not a fence.
                    if marker == '~' || !info.contains('`') {
                        open_fence = Some((marker, len));
                        continue;
                    }
                }
            }
        }

        if open_fence.is_some() {
            continue;
        }

        if let Some((level, text)) = parse_atx_heading(trimmed) {
            headings.push(HeadingInfo {
                line: i,
                level,
                text: text.to_string(),
            });
        }
    }

    headings
}

/// Returns the fence marker and run length when `trimmed` starts with three
/// or more backticks or three or more tildes.
fn fence_run(trimmed: &str) -> Option<(char, usize)> {
    let marker = trimmed.chars().next().filter(|&c| c == '`' || c == '~')?;
    let len = trimmed.chars().take_while(|&c| c == marker).count();
    if len >= 3 {
        Some((marker, len))
    } else {
        None
    }
}

/// Splits an already-trimmed line into `(level, text)` if it is an ATX heading.
fn parse_atx_heading(trimmed: &str) -> Option<(u8, &str)> {
    // Count in `usize` and range-check before narrowing, so arbitrarily long
    // `#` runs can neither overflow nor wrap around into a valid level.
    let hashes = trimmed.chars().take_while(|&c| c == '#').count();
    if hashes == 0 || hashes > 6 {
        return None;
    }

    let rest = &trimmed[hashes..];
    if !(rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t')) {
        return None;
    }

    let text = rest.trim();
    let without_closing = text.trim_end_matches('#');
    let text = if without_closing.is_empty() {
        // Only a closing run (or nothing at all): the heading is empty.
        ""
    } else if without_closing.ends_with(char::is_whitespace) {
        // A closing run counts only when whitespace separates it from the text.
        without_closing.trim_end()
    } else {
        // Hashes glued to the text (as in "C#") are part of the heading.
        text
    };

    // Lossless: `hashes` was checked to be in 1..=6 above.
    Some((hashes as u8, text))
}
36+
37+
#[derive(Debug, Clone)]
38+
pub struct Section {
39+
pub heading: HeadingInfo,
40+
pub body_start: usize,
41+
pub body_end: usize,
42+
pub content: String,
43+
}
44+
45+
pub fn find_section(content: &str, heading_text: &str) -> Option<Section> {
46+
let headings = parse_headings(content);
47+
let target = heading_text.trim().to_lowercase();
48+
let lines: Vec<&str> = content.lines().collect();
49+
50+
let idx = headings.iter().position(|h| h.text.to_lowercase() == target)?;
51+
let h = &headings[idx];
52+
let body_start = h.line + 1;
53+
let body_end = headings[idx + 1..]
54+
.iter()
55+
.find(|next| next.level <= h.level)
56+
.map(|next| next.line)
57+
.unwrap_or(lines.len());
58+
59+
let content_str = lines[body_start..body_end].join("\n");
60+
Some(Section {
61+
heading: HeadingInfo {
62+
line: h.line,
63+
level: h.level,
64+
text: h.text.clone(),
65+
},
66+
body_start,
67+
body_end,
68+
content: content_str,
69+
})
70+
}
71+
72+
/// Splits a leading `---`-delimited frontmatter block off `content`.
///
/// Returns `(Some(frontmatter), body)` when the first line is `---` (ignoring
/// surrounding whitespace) and a later line closes the block with another
/// `---`. The frontmatter is the text between the two delimiter lines with
/// its lines joined by `\n`. The body is everything after the closing
/// delimiter line, copied verbatim, so line endings and a trailing newline
/// are preserved.
///
/// Returns `(None, content)` unchanged when the opening or the closing
/// delimiter is missing.
pub fn split_frontmatter(content: &str) -> (Option<String>, String) {
    // Keep the line terminators so byte offsets into `content` can be tracked.
    let mut raw_lines = content.split_inclusive('\n');

    let opening = match raw_lines.next() {
        Some(first) if first.trim() == "---" => first,
        _ => return (None, content.to_string()),
    };

    // Number of bytes of `content` consumed so far, terminators included.
    let mut consumed = opening.len();
    let mut frontmatter: Vec<&str> = Vec::new();

    for raw in raw_lines {
        consumed += raw.len();
        if raw.trim() == "---" {
            // Slice the original text instead of re-joining lines, which
            // would drop the final newline and rewrite CRLF as LF.
            let body = content[consumed..].to_string();
            return (Some(frontmatter.join("\n")), body);
        }
        let line = raw.strip_suffix('\n').unwrap_or(raw);
        frontmatter.push(line.strip_suffix('\r').unwrap_or(line));
    }

    // No closing delimiter: treat the whole input as body.
    (None, content.to_string())
}
86+
87+
#[cfg(test)]
mod tests {
    use super::{find_section, parse_headings, split_frontmatter};

    /// Headings of different levels are reported in document order.
    #[test]
    fn test_parse_headings_basic() {
        let parsed =
            parse_headings("# Title\n\nSome text\n\n## Section A\n\nContent\n\n## Section B\n");
        assert_eq!(parsed.len(), 3);

        let (title, section_a) = (&parsed[0], &parsed[1]);
        assert_eq!(title.level, 1);
        assert_eq!(title.text, "Title");
        assert_eq!(section_a.level, 2);
        assert_eq!(section_a.text, "Section A");
    }

    /// A `#` line inside a fenced code block is not a heading.
    #[test]
    fn test_parse_headings_ignores_code_blocks() {
        let parsed = parse_headings("# Real\n\n```\n# Not a heading\n```\n\n## Also Real\n");
        let texts: Vec<&str> = parsed.iter().map(|h| h.text.as_str()).collect();
        assert_eq!(texts, ["Real", "Also Real"]);
    }

    /// A closing run of `#` is not part of the heading text.
    #[test]
    fn test_parse_headings_strips_trailing_hashes() {
        let parsed = parse_headings("## Heading ##\n");
        assert_eq!(parsed[0].text, "Heading");
    }

    /// The section body stops at the next heading of the same level.
    #[test]
    fn test_find_section_basic() {
        let doc = "# Title\n\n## Interactions\n\nEntry 1\nEntry 2\n\n## Links\n\nSome links\n";
        let found = find_section(doc, "Interactions").unwrap();
        assert_eq!(found.heading.text, "Interactions");
        assert!(found.content.contains("Entry 1"));
        assert!(!found.content.contains("Some links"));
    }

    /// Heading lookup ignores letter case.
    #[test]
    fn test_find_section_case_insensitive() {
        let found = find_section("## My Section\n\nContent\n", "my section");
        assert!(found.is_some());
    }

    /// Deeper headings stay inside the enclosing section's body.
    #[test]
    fn test_find_section_with_subsections() {
        let doc = "# Title\n\n## Interactions\n\nEntry\n\n### Sub-detail\n\nMore\n\n## Links\n\nSome links\n";
        let body = find_section(doc, "Interactions").unwrap().content;
        assert!(body.contains("Entry"));
        assert!(body.contains("Sub-detail"));
        assert!(!body.contains("Some links"));
    }

    /// An unknown heading yields `None`.
    #[test]
    fn test_find_section_not_found() {
        let found = find_section("## Existing\n\nContent\n", "Missing");
        assert!(found.is_none());
    }

    /// A delimited block at the top is split off from the body.
    #[test]
    fn test_split_frontmatter_valid() {
        let (frontmatter, rest) =
            split_frontmatter("---\ntitle: Test\ntags:\n  - foo\n---\n\n# Body\n");
        assert!(frontmatter.is_some());
        assert!(frontmatter.as_deref().unwrap().contains("title: Test"));
        assert!(rest.contains("# Body"));
    }

    /// Without an opening delimiter the whole input is body.
    #[test]
    fn test_split_frontmatter_none() {
        let (frontmatter, rest) = split_frontmatter("# No frontmatter\n\nJust content\n");
        assert!(frontmatter.is_none());
        assert!(rest.contains("No frontmatter"));
    }

    /// `#tag` in running text is not mistaken for a heading.
    #[test]
    fn test_parse_headings_ignores_inline_tags() {
        let parsed =
            parse_headings("# Title\n\nSome text with #tag and #another-tag\n\n## Real Section\n");
        let texts: Vec<&str> = parsed.iter().map(|h| h.text.as_str()).collect();
        assert_eq!(texts, ["Title", "Real Section"]);
    }
}

0 commit comments

Comments
 (0)