Skip to content

Commit 8640681

Browse files
committed
feat: add _read_concept_briefs for concept dedup context
1 parent cc12d95 commit 8640681

2 files changed

Lines changed: 115 additions & 4 deletions

File tree

openkb/agent/compiler.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
import asyncio
1313
import json
1414
import logging
15+
import re
1516
import sys
17+
import threading
1618
import time
1719
from pathlib import Path
1820

@@ -95,9 +97,6 @@
9597
# LLM helpers
9698
# ---------------------------------------------------------------------------
9799

98-
import threading
99-
100-
101100
class _Spinner:
102101
"""Animated dots spinner that runs in a background thread."""
103102

@@ -208,6 +207,37 @@ def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]:
208207
return index_content, existing
209208

210209

210+
def _read_concept_briefs(wiki_dir: Path) -> str:
    """Return compact one-line summaries of the existing concept pages.

    Every ``*.md`` file in ``wiki_dir / "concepts"`` is read in sorted order.
    A leading YAML frontmatter block is dropped, the remaining body is
    stripped, its newlines are replaced by spaces, and the first 150
    characters are kept. Each page becomes one ``- {slug}: {brief}`` line,
    where the slug is the file stem.

    Args:
        wiki_dir: Root directory of the wiki (the parent of ``concepts``).

    Returns:
        The summary lines joined with newlines, or ``"(none yet)"`` when the
        concepts directory is missing or contains no ``*.md`` files.
    """
    concepts_dir = wiki_dir / "concepts"
    if not concepts_dir.exists():
        return "(none yet)"

    md_files = sorted(concepts_dir.glob("*.md"))
    if not md_files:
        return "(none yet)"

    lines: list[str] = []
    for path in md_files:
        text = path.read_text(encoding="utf-8")
        if text.startswith("---"):
            # The closing fence must begin a line. Searching for a bare "---"
            # would also match dashes inside a frontmatter value (for example
            # "title: a---b") and leak the rest of the frontmatter into the
            # brief.
            end = text.find("\n---", 3)
            if end != -1:
                # Skip the newline plus the three fence characters.
                text = text[end + 4:]
            # An unterminated fence is left alone: the text is treated as body.
        body = text.strip().replace("\n", " ")
        lines.append(f"- {path.stem}: {body[:150]}")

    return "\n".join(lines)
239+
240+
211241
def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
212242
"""Find the original filename in raw/ for a given doc stem."""
213243
raw_dir = kb_dir / "raw"
@@ -226,11 +256,24 @@ def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str
226256
(summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
227257

228258

259+
# Matches every character that is NOT a word character or "-". For ``str``
# patterns ``\w`` is Unicode-aware, so letters and digits from any script are
# kept, while path separators, dots, and whitespace are still replaced.
_SAFE_NAME_RE = re.compile(r'[^\w\-]')


def _sanitize_concept_name(name: str) -> str:
    """Sanitize a concept name for safe use as a filename.

    Each character other than a Unicode word character or ``-`` is replaced
    with ``-``, then leading and trailing dashes are removed. Because ``/``,
    ``\\`` and ``.`` are all replaced, the result cannot contain a path
    separator or a ``..`` component.

    Non-ASCII letters are kept on purpose: with an ASCII-only class, every
    name written entirely in another script reduced to the same fallback and
    so mapped to a single shared filename.

    Args:
        name: Raw concept name.

    Returns:
        The sanitized name, or ``"unnamed-concept"`` when nothing usable
        remains after sanitizing.
    """
    sanitized = _SAFE_NAME_RE.sub("-", name).strip("-")
    return sanitized or "unnamed-concept"
266+
267+
229268
def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool) -> None:
230269
"""Write or update a concept page, managing the sources frontmatter."""
231270
concepts_dir = wiki_dir / "concepts"
232271
concepts_dir.mkdir(parents=True, exist_ok=True)
233-
path = concepts_dir / f"{name}.md"
272+
safe_name = _sanitize_concept_name(name)
273+
path = (concepts_dir / f"{safe_name}.md").resolve()
274+
if not path.is_relative_to(concepts_dir.resolve()):
275+
logger.warning("Concept name escapes concepts dir: %s", name)
276+
return
234277

235278
if is_update and path.exists():
236279
existing = path.read_text(encoding="utf-8")
@@ -241,7 +284,11 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
241284
body = existing[end + 3:]
242285
if "sources:" in fm:
243286
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
287+
else:
288+
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
244289
existing = fm + body
290+
else:
291+
existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
245292
existing += f"\n\n{content}"
246293
path.write_text(existing, encoding="utf-8")
247294
else:

tests/test_compiler.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
_write_concept,
1616
_update_index,
1717
_read_wiki_context,
18+
_read_concept_briefs,
1819
)
1920

2021

@@ -116,6 +117,69 @@ def test_with_content(self, tmp_path):
116117
assert concepts == ["attention", "transformer"]
117118

118119

120+
class TestReadConceptBriefs:
    """Behavioral tests for ``_read_concept_briefs``."""

    @staticmethod
    def _make_concepts(tmp_path):
        """Create ``wiki/concepts`` under *tmp_path* and return both paths."""
        wiki_root = tmp_path / "wiki"
        concepts_root = wiki_root / "concepts"
        concepts_root.mkdir(parents=True)
        return wiki_root, concepts_root

    def test_empty_wiki(self, tmp_path):
        """An empty concepts directory yields the placeholder."""
        wiki_root, _ = self._make_concepts(tmp_path)
        assert _read_concept_briefs(wiki_root) == "(none yet)"

    def test_no_concepts_dir(self, tmp_path):
        """A wiki without a concepts directory yields the placeholder."""
        wiki_root = tmp_path / "wiki"
        wiki_root.mkdir()
        assert _read_concept_briefs(wiki_root) == "(none yet)"

    def test_reads_briefs_with_frontmatter(self, tmp_path):
        """Frontmatter is dropped; only the body reaches the brief."""
        wiki_root, concepts_root = self._make_concepts(tmp_path)
        page = concepts_root / "attention.md"
        page.write_text(
            "---\nsources: [paper.pdf]\n---\n\nAttention is a mechanism that allows models to focus on relevant parts.",
            encoding="utf-8",
        )
        output = _read_concept_briefs(wiki_root)
        for expected in ("- attention:", "Attention is a mechanism"):
            assert expected in output
        for unexpected in ("sources", "---"):
            assert unexpected not in output

    def test_reads_briefs_without_frontmatter(self, tmp_path):
        """A page with no frontmatter is summarized from its first characters."""
        wiki_root, concepts_root = self._make_concepts(tmp_path)
        page = concepts_root / "transformer.md"
        page.write_text(
            "Transformer is a neural network architecture based on attention.",
            encoding="utf-8",
        )
        output = _read_concept_briefs(wiki_root)
        assert "- transformer:" in output
        assert "Transformer is a neural network" in output

    def test_truncates_long_content(self, tmp_path):
        """Bodies longer than 150 characters are cut to exactly 150."""
        wiki_root, concepts_root = self._make_concepts(tmp_path)
        (concepts_root / "longconcept.md").write_text("A" * 300, encoding="utf-8")
        output = _read_concept_briefs(wiki_root)
        # Everything after the slug prefix is the brief itself.
        pieces = output.split("- longconcept: ", 1)
        brief = pieces[1]
        assert len(brief) == 150
        assert brief == "A" * 150

    def test_sorted_alphabetically(self, tmp_path):
        """Lines come out ordered by slug, not by creation order."""
        wiki_root, concepts_root = self._make_concepts(tmp_path)
        pages = (
            ("zebra.md", "Zebra concept."),
            ("apple.md", "Apple concept."),
            ("mango.md", "Mango concept."),
        )
        for filename, text in pages:
            (concepts_root / filename).write_text(text, encoding="utf-8")
        output = _read_concept_briefs(wiki_root)
        slugs = []
        for row in output.strip().splitlines():
            head = row.split(":")[0]
            slugs.append(head.lstrip("- "))
        assert slugs == ["apple", "mango", "zebra"]
181+
182+
119183
def _mock_completion(responses: list[str]):
120184
"""Create a mock for litellm.completion that returns responses in order."""
121185
call_count = {"n": 0}

0 commit comments

Comments
 (0)