1616import sys
1717import threading
1818import time
19+ import unicodedata
1920from pathlib import Path
2021
2122import litellm
@@ -215,14 +216,18 @@ async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> s
215216
216217
def _parse_json(text: str) -> list | dict:
    """Decode the JSON payload of an LLM reply.

    A leading Markdown fence line (or a bare leading fence marker when the
    reply has no newline) and a trailing fence marker are removed, the rest is
    passed through ``repair_json`` and then decoded with ``json.loads``.

    Raises:
        ValueError: if the decoded value is neither a ``dict`` nor a ``list``.
            ``json.loads`` may also raise ``json.JSONDecodeError``.
    """
    from json_repair import repair_json

    payload = text.strip()
    if payload.startswith("```"):
        # Drop the whole opening fence line, including any language tag.
        _, newline, remainder = payload.partition("\n")
        payload = remainder if newline else payload[3:]
    payload = payload.removesuffix("```")

    decoded = json.loads(repair_json(payload.strip()))
    if isinstance(decoded, (dict, list)):
        return decoded
    raise ValueError(f"Expected JSON object or array, got {type(decoded).__name__}")
226231
227232
228233# ---------------------------------------------------------------------------
@@ -279,39 +284,82 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
279284 return "\n " .join (lines ) or "(none yet)"
280285
281286
282- def _find_source_filename (doc_name : str , kb_dir : Path ) -> str :
283- """Find the original filename in raw/ for a given doc stem."""
284- raw_dir = kb_dir / "raw"
285- if raw_dir .exists ():
286- for f in raw_dir .iterdir ():
287- if f .stem == doc_name :
288- return f .name
289- return f"{ doc_name } .pdf"
290-
291-
292- def _write_summary (wiki_dir : Path , doc_name : str , source_file : str , summary : str ,
293- brief : str = "" , doc_type : str = "short" ) -> None :
294- """Write summary page with frontmatter.
295-
296- For short docs, includes a ``source_doc`` field linking to the full
297- source text in ``sources/{doc_name}.md``.
298- """
287+ def _get_section_bounds (lines : list [str ], heading : str ) -> tuple [int , int ] | None :
288+ """Return the [start, end) bounds for a Markdown H2 section."""
289+ for i , line in enumerate (lines ):
290+ if line == heading :
291+ start = i + 1
292+ end = len (lines )
293+ for j in range (start , len (lines )):
294+ if lines [j ].startswith ("## " ):
295+ end = j
296+ break
297+ return start , end
298+ return None
299+
300+
def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
    """Tell whether the section under ``heading`` already lists ``link``.

    A line counts as a match when it starts with ``"- "`` followed by the
    link text. Returns ``False`` when the section itself is not found.
    """
    span = _get_section_bounds(lines, heading)
    if span is None:
        return False

    first, last = span
    marker = f"- {link}"
    for candidate in lines[first:last]:
        if candidate.startswith(marker):
            return True
    return False
310+
311+
def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str) -> bool:
    """Overwrite the first entry for ``link`` inside the ``heading`` section.

    ``lines`` is modified in place. Returns ``True`` when a line was
    replaced, ``False`` when the section or the entry was not found.
    """
    span = _get_section_bounds(lines, heading)
    if span is None:
        return False

    first, last = span
    marker = f"- {link}"
    hit = next(
        (idx for idx in range(first, last) if lines[idx].startswith(marker)),
        None,
    )
    if hit is None:
        return False
    lines[hit] = entry
    return True
325+
326+
def _insert_section_entry(lines: list[str], heading: str, entry: str) -> bool:
    """Add ``entry`` as the first line of the ``heading`` section.

    ``lines`` is modified in place. Returns ``False`` (and leaves ``lines``
    untouched) when the section is not found, ``True`` otherwise.
    """
    span = _get_section_bounds(lines, heading)
    if span is not None:
        # Index 0 of the span is the line directly below the heading.
        lines.insert(span[0], entry)
        return True
    return False
336+
337+
338+
339+ def _write_summary (wiki_dir : Path , doc_name : str , summary : str ,
340+ doc_type : str = "short" ) -> None :
341+ """Write summary page with frontmatter."""
342+ if summary .startswith ("---" ):
343+ end = summary .find ("---" , 3 )
344+ if end != - 1 :
345+ summary = summary [end + 3 :].lstrip ("\n " )
299346 summaries_dir = wiki_dir / "summaries"
300347 summaries_dir .mkdir (parents = True , exist_ok = True )
301- fm_lines = [ f"sources: [ { source_file } ]" ]
302- if brief :
303- fm_lines . append ( f"brief : { brief } " )
304- if doc_type == "short" :
305- fm_lines . append ( f"source_doc: sources/ { doc_name } .md" )
348+ ext = "md" if doc_type == "short" else "json"
349+ fm_lines = [
350+ f"doc_type : { doc_type } " ,
351+ f"full_text: sources/ { doc_name } . { ext } " ,
352+ ]
306353 frontmatter = "---\n " + "\n " .join (fm_lines ) + "\n ---\n \n "
307354 (summaries_dir / f"{ doc_name } .md" ).write_text (frontmatter + summary , encoding = "utf-8" )
308355
309356
310- _SAFE_NAME_RE = re .compile (r'[^a-zA-Z0-9_ \-]' )
357+ _SAFE_NAME_RE = re .compile (r'[^\w \-]' )
311358
312359
def _sanitize_concept_name(name: str) -> str:
    """Turn a concept name into a string that is safe to use as a filename.

    The name is NFKC-normalized, every match of ``_SAFE_NAME_RE`` is replaced
    with ``-``, and leading/trailing dashes are trimmed. If nothing is left,
    ``"unnamed-concept"`` is returned.
    """
    normalized = unicodedata.normalize("NFKC", name)
    slug = _SAFE_NAME_RE.sub("-", normalized).strip("-")
    if slug:
        return slug
    return "unnamed-concept"
317365
@@ -341,7 +389,21 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
341389 existing = fm + body
342390 else :
343391 existing = f"---\n sources: [{ source_file } ]\n ---\n \n " + existing
344- existing += f"\n \n { content } "
392+ # Strip frontmatter from LLM content to avoid duplicate blocks
393+ clean = content
394+ if clean .startswith ("---" ):
395+ end = clean .find ("---" , 3 )
396+ if end != - 1 :
397+ clean = clean [end + 3 :].lstrip ("\n " )
398+ # Replace body with LLM rewrite (prompt asks for full rewrite, not delta)
399+ if existing .startswith ("---" ):
400+ end = existing .find ("---" , 3 )
401+ if end != - 1 :
402+ existing = existing [:end + 3 ] + "\n \n " + clean
403+ else :
404+ existing = clean
405+ else :
406+ existing = clean
345407 if brief and existing .startswith ("---" ):
346408 end = existing .find ("---" , 3 )
347409 if end != - 1 :
@@ -354,6 +416,10 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
354416 existing = fm + body
355417 path .write_text (existing , encoding = "utf-8" )
356418 else :
419+ if content .startswith ("---" ):
420+ end = content .find ("---" , 3 )
421+ if end != - 1 :
422+ content = content [end + 3 :].lstrip ("\n " )
357423 fm_lines = [f"sources: [{ source_file } ]" ]
358424 if brief :
359425 fm_lines .append (f"brief: { brief } " )
@@ -445,7 +511,6 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
445511 text += f"\n \n ## Related Documents\n - { link } \n "
446512 path .write_text (text , encoding = "utf-8" )
447513
448-
449514def _update_index (
450515 wiki_dir : Path , doc_name : str , concept_names : list [str ],
451516 doc_brief : str = "" , concept_briefs : dict [str , str ] | None = None ,
@@ -454,8 +519,9 @@ def _update_index(
454519 """Append document and concept entries to index.md.
455520
456521 When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries
457- are written as ``- [[link]] (type) — brief text``. Existing entries are
458- detected by the link part only, so updating a brief on a re-compile works.
522+ are written as ``- [[link]] (type) — brief text``. Existing entries are
523+ detected within their own section by exact entry prefix and skipped to
524+ avoid duplicates.
459525 ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the
460526 query agent knows how to access detailed content.
461527 """
@@ -469,26 +535,27 @@ def _update_index(
469535 encoding = "utf-8" ,
470536 )
471537
472- text = index_path .read_text (encoding = "utf-8" )
538+ lines = index_path .read_text (encoding = "utf-8" ). split ( " \n " )
473539
474540 doc_link = f"[[summaries/{ doc_name } ]]"
475- if doc_link not in text :
541+ if not _section_contains_link ( lines , "## Documents" , doc_link ) :
476542 doc_entry = f"- { doc_link } ({ doc_type } )"
477543 if doc_brief :
478544 doc_entry += f" — { doc_brief } "
479- if "## Documents" in text :
480- text = text .replace ("## Documents\n " , f"## Documents\n { doc_entry } \n " , 1 )
545+ _insert_section_entry (lines , "## Documents" , doc_entry )
481546
482547 for name in concept_names :
483548 concept_link = f"[[concepts/{ name } ]]"
484- if concept_link not in text :
485- concept_entry = f"- { concept_link } "
549+ concept_entry = f"- { concept_link } "
550+ if name in concept_briefs :
551+ concept_entry += f" — { concept_briefs [name ]} "
552+ if _section_contains_link (lines , "## Concepts" , concept_link ):
486553 if name in concept_briefs :
487- concept_entry += f" — { concept_briefs [ name ] } "
488- if "## Concepts" in text :
489- text = text . replace ( "## Concepts \n " , f "## Concepts\n { concept_entry } \n " , 1 )
554+ _replace_section_entry ( lines , "## Concepts" , concept_link , concept_entry )
555+ else :
556+ _insert_section_entry ( lines , "## Concepts" , concept_entry )
490557
491- index_path .write_text (text , encoding = "utf-8" )
558+ index_path .write_text (" \n " . join ( lines ) , encoding = "utf-8" )
492559
493560
494561# ---------------------------------------------------------------------------
@@ -515,7 +582,7 @@ async def _compile_concepts(
515582 Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related
516583 actions, then executes each action type accordingly.
517584 """
518- source_file = _find_source_filename ( doc_name , kb_dir )
585+ source_file = f"summaries/ { doc_name } .md"
519586
520587 # --- Step 2: Get concepts plan (A cached) ---
521588 concept_briefs = _read_concept_briefs (wiki_dir )
@@ -534,7 +601,7 @@ async def _compile_concepts(
534601 except (json .JSONDecodeError , ValueError ) as exc :
535602 logger .warning ("Failed to parse concepts plan: %s" , exc )
536603 logger .debug ("Raw: %s" , plan_raw )
537- _update_index (wiki_dir , doc_name , [])
604+ _update_index (wiki_dir , doc_name , [], doc_brief = doc_brief , doc_type = doc_type )
538605 return
539606
540607 # Fallback: if LLM returns a flat list, treat all items as "create"
@@ -552,7 +619,7 @@ async def _compile_concepts(
552619 related_items = plan ["related" ]
553620
554621 if not create_items and not update_items and not related_items :
555- _update_index (wiki_dir , doc_name , [])
622+ _update_index (wiki_dir , doc_name , [], doc_brief = doc_brief , doc_type = doc_type )
556623 return
557624
558625 # --- Step 3: Generate/update concept pages concurrently (A cached) ---
@@ -570,7 +637,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
570637 title = title , doc_name = doc_name ,
571638 update_instruction = "" ,
572639 )},
573- ], f"concept:{ name } " )
640+ ], f"concept: { name } " )
574641 try :
575642 parsed = _parse_json (raw )
576643 brief = parsed .get ("brief" , "" )
@@ -582,7 +649,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
582649 async def _gen_update (concept : dict ) -> tuple [str , str , bool , str ]:
583650 name = concept ["name" ]
584651 title = concept .get ("title" , name )
585- concept_path = wiki_dir / "concepts" / f"{ name } .md"
652+ concept_path = wiki_dir / "concepts" / f"{ _sanitize_concept_name ( name ) } .md"
586653 if concept_path .exists ():
587654 raw_text = concept_path .read_text (encoding = "utf-8" )
588655 if raw_text .startswith ("---" ):
@@ -601,7 +668,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
601668 title = title , doc_name = doc_name ,
602669 existing_content = existing_content ,
603670 )},
604- ], f"update:{ name } " )
671+ ], f"update: { name } " )
605672 try :
606673 parsed = _parse_json (raw )
607674 brief = parsed .get ("brief" , "" )
@@ -630,16 +697,18 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
630697 continue
631698 name , page_content , is_update , brief = r
632699 _write_concept (wiki_dir , name , page_content , source_file , is_update , brief = brief )
633- concept_names .append (name )
700+ safe_name = _sanitize_concept_name (name )
701+ concept_names .append (safe_name )
634702 if brief :
635- concept_briefs_map [name ] = brief
703+ concept_briefs_map [safe_name ] = brief
636704
637705 # --- Step 3b: Process related items (code only, no LLM) ---
638- for slug in related_items :
706+ sanitized_related = [_sanitize_concept_name (s ) for s in related_items ]
707+ for slug in sanitized_related :
639708 _add_related_link (wiki_dir , slug , doc_name , source_file )
640709
641710 # --- Step 3c: Backlink — summary ↔ concepts (code only) ---
642- all_concept_slugs = concept_names + [ s for s in related_items ]
711+ all_concept_slugs = concept_names + sanitized_related
643712 if all_concept_slugs :
644713 _backlink_summary (wiki_dir , doc_name , all_concept_slugs )
645714 _backlink_concepts (wiki_dir , doc_name , all_concept_slugs )
@@ -670,7 +739,6 @@ async def compile_short_doc(
670739
671740 wiki_dir = kb_dir / "wiki"
672741 schema_md = get_agents_md (wiki_dir )
673- source_file = _find_source_filename (doc_name , kb_dir )
674742 content = source_path .read_text (encoding = "utf-8" )
675743
676744 # Base context A: system + document
@@ -690,7 +758,7 @@ async def compile_short_doc(
690758 except (json .JSONDecodeError , ValueError ):
691759 doc_brief = ""
692760 summary = summary_raw
693- _write_summary (wiki_dir , doc_name , source_file , summary , brief = doc_brief )
761+ _write_summary (wiki_dir , doc_name , summary )
694762
695763 # --- Steps 2-4: Concept plan → generate/update → index ---
696764 await _compile_concepts (
0 commit comments