Skip to content

Commit 5b086a5

Browse files
committed
feat: wire brief+content JSON through compile pipeline to index and frontmatter
1 parent ca23912 commit 5b086a5

3 files changed

Lines changed: 120 additions & 22 deletions

File tree

openkb/agent/compiler.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ async def _compile_concepts(
497497
summary: str,
498498
doc_name: str,
499499
max_concurrency: int,
500+
doc_brief: str = "",
500501
) -> None:
501502
"""Shared Steps 2-4: concepts plan → generate/update → index.
502503
@@ -546,11 +547,11 @@ async def _compile_concepts(
546547
# --- Step 3: Generate/update concept pages concurrently (A cached) ---
547548
semaphore = asyncio.Semaphore(max_concurrency)
548549

549-
async def _gen_create(concept: dict) -> tuple[str, str, bool]:
550+
async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
550551
name = concept["name"]
551552
title = concept.get("title", name)
552553
async with semaphore:
553-
page_content = await _llm_call_async(model, [
554+
raw = await _llm_call_async(model, [
554555
system_msg,
555556
doc_msg,
556557
{"role": "assistant", "content": summary},
@@ -559,9 +560,15 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool]:
559560
update_instruction="",
560561
)},
561562
], f"concept:{name}")
562-
return name, page_content, False
563-
564-
async def _gen_update(concept: dict) -> tuple[str, str, bool]:
563+
try:
564+
parsed = _parse_json(raw)
565+
brief = parsed.get("brief", "")
566+
content = parsed.get("content", raw)
567+
except (json.JSONDecodeError, ValueError):
568+
brief, content = "", raw
569+
return name, content, False, brief
570+
571+
async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
565572
name = concept["name"]
566573
title = concept.get("title", name)
567574
concept_path = wiki_dir / "concepts" / f"{name}.md"
@@ -575,7 +582,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
575582
else:
576583
existing_content = "(page not found — create from scratch)"
577584
async with semaphore:
578-
page_content = await _llm_call_async(model, [
585+
raw = await _llm_call_async(model, [
579586
system_msg,
580587
doc_msg,
581588
{"role": "assistant", "content": summary},
@@ -584,13 +591,20 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
584591
existing_content=existing_content,
585592
)},
586593
], f"update:{name}")
587-
return name, page_content, True
594+
try:
595+
parsed = _parse_json(raw)
596+
brief = parsed.get("brief", "")
597+
content = parsed.get("content", raw)
598+
except (json.JSONDecodeError, ValueError):
599+
brief, content = "", raw
600+
return name, content, True, brief
588601

589602
tasks = []
590603
tasks.extend(_gen_create(c) for c in create_items)
591604
tasks.extend(_gen_update(c) for c in update_items)
592605

593606
concept_names: list[str] = []
607+
concept_briefs_map: dict[str, str] = {}
594608

595609
if tasks:
596610
total = len(tasks)
@@ -603,9 +617,11 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
603617
if isinstance(r, Exception):
604618
logger.warning("Concept generation failed: %s", r)
605619
continue
606-
name, page_content, is_update = r
607-
_write_concept(wiki_dir, name, page_content, source_file, is_update)
620+
name, page_content, is_update, brief = r
621+
_write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief)
608622
concept_names.append(name)
623+
if brief:
624+
concept_briefs_map[name] = brief
609625

610626
# --- Step 3b: Process related items (code only, no LLM) ---
611627
for slug in related_items:
@@ -618,7 +634,8 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool]:
618634
_backlink_concepts(wiki_dir, doc_name, all_concept_slugs)
619635

620636
# --- Step 4: Update index (code only) ---
621-
_update_index(wiki_dir, doc_name, concept_names)
637+
_update_index(wiki_dir, doc_name, concept_names,
638+
doc_brief=doc_brief, concept_briefs=concept_briefs_map)
622639

623640

624641
async def compile_short_doc(
@@ -653,13 +670,20 @@ async def compile_short_doc(
653670
)}
654671

655672
# --- Step 1: Generate summary ---
656-
summary = _llm_call(model, [system_msg, doc_msg], "summary")
657-
_write_summary(wiki_dir, doc_name, source_file, summary)
673+
summary_raw = _llm_call(model, [system_msg, doc_msg], "summary")
674+
try:
675+
summary_parsed = _parse_json(summary_raw)
676+
doc_brief = summary_parsed.get("brief", "")
677+
summary = summary_parsed.get("content", summary_raw)
678+
except (json.JSONDecodeError, ValueError):
679+
doc_brief = ""
680+
summary = summary_raw
681+
_write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
658682

659683
# --- Steps 2-4: Concept plan → generate/update → index ---
660684
await _compile_concepts(
661685
wiki_dir, kb_dir, model, system_msg, doc_msg,
662-
summary, doc_name, max_concurrency,
686+
summary, doc_name, max_concurrency, doc_brief=doc_brief,
663687
)
664688

665689

@@ -669,6 +693,7 @@ async def compile_long_doc(
669693
doc_id: str,
670694
kb_dir: Path,
671695
model: str,
696+
doc_description: str = "",
672697
max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
673698
) -> None:
674699
"""Compile a long (PageIndex) document's concepts and index.
@@ -700,5 +725,5 @@ async def compile_long_doc(
700725
# --- Steps 2-4: Concept plan → generate/update → index ---
701726
await _compile_concepts(
702727
wiki_dir, kb_dir, model, system_msg, doc_msg,
703-
overview, doc_name, max_concurrency,
728+
overview, doc_name, max_concurrency, doc_brief=doc_description,
704729
)

openkb/cli.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None:
132132
for attempt in range(2):
133133
try:
134134
asyncio.run(
135-
compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model)
135+
compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model,
136+
doc_description=index_result.description)
136137
)
137138
break
138139
except Exception as exc:

tests/test_compiler.py

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -487,13 +487,19 @@ async def test_full_pipeline(self, tmp_path):
487487
(tmp_path / "raw").mkdir()
488488
(tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
489489

490-
summary_response = "# Summary\n\nThis document discusses transformers."
490+
summary_response = json.dumps({
491+
"brief": "Discusses transformers",
492+
"content": "# Summary\n\nThis document discusses transformers.",
493+
})
491494
concepts_list_response = json.dumps({
492495
"create": [{"name": "transformer", "title": "Transformer"}],
493496
"update": [],
494497
"related": [],
495498
})
496-
concept_page_response = "# Transformer\n\nA neural network architecture."
499+
concept_page_response = json.dumps({
500+
"brief": "NN architecture using self-attention",
501+
"content": "# Transformer\n\nA neural network architecture.",
502+
})
497503

498504
with patch("openkb.agent.compiler.litellm") as mock_litellm:
499505
mock_litellm.completion = MagicMock(
@@ -534,7 +540,7 @@ async def test_handles_bad_json(self, tmp_path):
534540

535541
with patch("openkb.agent.compiler.litellm") as mock_litellm:
536542
mock_litellm.completion = MagicMock(
537-
side_effect=_mock_completion(["Summary text", "not valid json"])
543+
side_effect=_mock_completion(["Plain summary text", "not valid json"])
538544
)
539545
# Should not raise
540546
await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
@@ -567,7 +573,10 @@ async def test_full_pipeline(self, tmp_path):
567573
"update": [],
568574
"related": [],
569575
})
570-
concept_page_response = "# Deep Learning\n\nA subfield of ML."
576+
concept_page_response = json.dumps({
577+
"brief": "Subfield of ML using neural networks",
578+
"content": "# Deep Learning\n\nA subfield of ML.",
579+
})
571580

572581
with patch("openkb.agent.compiler.litellm") as mock_litellm:
573582
mock_litellm.completion = MagicMock(
@@ -624,8 +633,14 @@ async def test_create_and_update_flow(self, tmp_path):
624633
"update": [{"name": "attention", "title": "Attention"}],
625634
"related": [],
626635
})
627-
create_page_response = "# Flash Attention\n\nAn efficient attention algorithm."
628-
update_page_response = "# Attention\n\nUpdated content with new info."
636+
create_page_response = json.dumps({
637+
"brief": "Efficient attention algorithm",
638+
"content": "# Flash Attention\n\nAn efficient attention algorithm.",
639+
})
640+
update_page_response = json.dumps({
641+
"brief": "Updated attention mechanism",
642+
"content": "# Attention\n\nUpdated content with new info.",
643+
})
629644

630645
system_msg = {"role": "system", "content": "You are a wiki agent."}
631646
doc_msg = {"role": "user", "content": "Document about attention mechanisms."}
@@ -720,7 +735,10 @@ async def test_fallback_list_format(self, tmp_path):
720735
plan_response = json.dumps([
721736
{"name": "attention", "title": "Attention"},
722737
])
723-
concept_page_response = "# Attention\n\nA mechanism for focusing."
738+
concept_page_response = json.dumps({
739+
"brief": "A mechanism for focusing",
740+
"content": "# Attention\n\nA mechanism for focusing.",
741+
})
724742

725743
system_msg = {"role": "system", "content": "You are a wiki agent."}
726744
doc_msg = {"role": "user", "content": "Document content."}
@@ -744,3 +762,57 @@ async def test_fallback_list_format(self, tmp_path):
744762
att_text = att_path.read_text()
745763
assert "sources: [test-doc.pdf]" in att_text
746764
assert "Attention" in att_text
765+
766+
767+
class TestBriefIntegration:
768+
@pytest.mark.asyncio
769+
async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
770+
wiki = tmp_path / "wiki"
771+
(wiki / "sources").mkdir(parents=True)
772+
(wiki / "summaries").mkdir(parents=True)
773+
(wiki / "concepts").mkdir(parents=True)
774+
(wiki / "index.md").write_text(
775+
"# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
776+
encoding="utf-8",
777+
)
778+
source_path = wiki / "sources" / "test-doc.md"
779+
source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8")
780+
(tmp_path / ".openkb").mkdir()
781+
(tmp_path / "raw").mkdir()
782+
(tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake")
783+
784+
summary_resp = json.dumps({
785+
"brief": "A paper about transformers",
786+
"content": "# Summary\n\nThis paper discusses transformers.",
787+
})
788+
plan_resp = json.dumps({
789+
"create": [{"name": "transformer", "title": "Transformer"}],
790+
"update": [],
791+
"related": [],
792+
})
793+
concept_resp = json.dumps({
794+
"brief": "NN architecture using self-attention",
795+
"content": "# Transformer\n\nA neural network architecture.",
796+
})
797+
798+
with patch("openkb.agent.compiler.litellm") as mock_litellm:
799+
mock_litellm.completion = MagicMock(
800+
side_effect=_mock_completion([summary_resp, plan_resp])
801+
)
802+
mock_litellm.acompletion = AsyncMock(
803+
side_effect=_mock_acompletion([concept_resp])
804+
)
805+
await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini")
806+
807+
# Summary frontmatter has brief
808+
summary_text = (wiki / "summaries" / "test-doc.md").read_text()
809+
assert "brief: A paper about transformers" in summary_text
810+
811+
# Concept frontmatter has brief
812+
concept_text = (wiki / "concepts" / "transformer.md").read_text()
813+
assert "brief: NN architecture using self-attention" in concept_text
814+
815+
# Index has briefs
816+
index_text = (wiki / "index.md").read_text()
817+
assert "— A paper about transformers" in index_text
818+
assert "— NN architecture using self-attention" in index_text

0 commit comments

Comments (0)