VectifyAI
diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
Lines changed: 133 additions & 137 deletions
@@ -396,127 +396,144 @@ def _update_index(wiki_dir: Path, doc_name: str, concept_names: list[str]) -> No
 DEFAULT_COMPILE_CONCURRENCY = 5
 
 
-async def compile_short_doc(
-    doc_name: str,
-    source_path: Path,
+async def _compile_concepts(  # called by compile_short_doc and compile_long_doc
+    wiki_dir: Path,
     kb_dir: Path,
     model: str,
-    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+    system_msg: dict,  # system-role message, resent with every LLM call below
+    doc_msg: dict,  # user-role document message built by the caller
+    summary: str,  # replayed as the assistant turn that follows doc_msg
+    doc_name: str,
+    max_concurrency: int,  # size of the semaphore guarding page-generation calls
 ) -> None:
-    """Compile a short document using a multi-step LLM pipeline with caching.
+    """Shared Steps 2-4: concepts plan → generate/update → index.
 
-    Step 1: Build base context A (schema + doc content).
-    Step 2: A → generate summary.
-    Step 3: A + summary → extract concept list.
-    Step 4: Concurrent LLM calls (A cached) → generate each concept page.
-    Step 5: Code writes files, updates index.
+    Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related
+    actions, then executes each action type accordingly.
     """
-    from openkb.config import load_config
-
-    openkb_dir = kb_dir / ".openkb"
-    config = load_config(openkb_dir / "config.yaml")
-    language: str = config.get("language", "en")
-
-    wiki_dir = kb_dir / "wiki"
-    schema_md = get_agents_md(wiki_dir)
     source_file = _find_source_filename(doc_name, kb_dir)
-    content = source_path.read_text(encoding="utf-8")
 
-    # Base context A: system + document
-    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
-        schema_md=schema_md, language=language,
-    )}
-    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
-        doc_name=doc_name, content=content,
-    )}
+    # --- Step 2: Get concepts plan (A cached) ---
+    concept_briefs = _read_concept_briefs(wiki_dir)
 
-    # --- Step 1: Generate summary ---
-    summary = _llm_call(model, [system_msg, doc_msg], "summary")
-    _write_summary(wiki_dir, doc_name, source_file, summary)
-
-    # --- Step 2: Extract concept list (A cached) ---
-    _, existing_concepts = _read_wiki_context(wiki_dir)
-
-    concepts_list_raw = _llm_call(model, [
+    plan_raw = _llm_call(model, [
         system_msg,
         doc_msg,
         {"role": "assistant", "content": summary},
-        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
-            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
+        {"role": "user", "content": _CONCEPTS_PLAN_USER.format(
+            concept_briefs=concept_briefs,
         )},
-    ], "concepts-list", max_tokens=512)
+    ], "concepts-plan", max_tokens=1024)
 
     try:
-        concepts_list = _parse_json(concepts_list_raw)
+        parsed = _parse_json(plan_raw)
     except (json.JSONDecodeError, ValueError) as exc:
-        logger.warning("Failed to parse concepts list: %s", exc)
-        logger.debug("Raw: %s", concepts_list_raw)
+        logger.warning("Failed to parse concepts plan: %s", exc)
+        logger.debug("Raw: %s", plan_raw)
         _update_index(wiki_dir, doc_name, [])
         return
 
-    if not concepts_list:
+    # Fallback: if LLM returns a flat list, treat all items as "create"
+    if isinstance(parsed, list):
+        plan = {"create": parsed, "update": [], "related": []}
+    else:  # NOTE(review): assumes parsed is a dict — confirm _parse_json cannot return a scalar
+        plan = {
+            "create": parsed.get("create", []),
+            "update": parsed.get("update", []),
+            "related": parsed.get("related", []),
+        }
+
+    create_items = plan["create"]
+    update_items = plan["update"]
+    related_items = plan["related"]
+
+    if not create_items and not update_items and not related_items:  # empty plan: nothing to generate or link
         _update_index(wiki_dir, doc_name, [])
         return
 
-    # --- Step 3: Generate concept pages concurrently (A cached) ---
+    # --- Step 3: Generate/update concept pages concurrently (A cached) ---
     semaphore = asyncio.Semaphore(max_concurrency)
 
-    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
+    async def _gen_create(concept: dict) -> tuple[str, str, bool]:  # (name, page text, is_update)
         name = concept["name"]
         title = concept.get("title", name)
-        is_update = concept.get("is_update", False)
-        update_instruction = (
-            "This concept page already exists. Add new information from this document "
-            "without duplicating existing content."
-            if is_update else ""
-        )
-
         async with semaphore:
             page_content = await _llm_call_async(model, [
                 system_msg,
                 doc_msg,
                 {"role": "assistant", "content": summary},
                 {"role": "user", "content": _CONCEPT_PAGE_USER.format(
                     title=title, doc_name=doc_name,
-                    update_instruction=update_instruction,
+                    update_instruction="",  # create path never sends an update instruction
                 )},
             ], f"concept:{name}")
+        return name, page_content, False  # False: unpacked as is_update when results are written
 
-        return name, page_content, is_update
+    async def _gen_update(concept: dict) -> tuple[str, str, bool]:  # (name, page text, is_update)
+        name = concept["name"]
+        title = concept.get("title", name)
+        concept_path = wiki_dir / "concepts" / f"{name}.md"  # NOTE(review): name comes from the LLM plan — sanitize before using it in a path?
+        if concept_path.exists():
+            raw_text = concept_path.read_text(encoding="utf-8")
+            if raw_text.startswith("---"):  # page opens with a "---"-delimited header block
+                parts = raw_text.split("---", 2)
+                existing_content = parts[2].strip() if len(parts) >= 3 else raw_text  # keep only the text after the second "---"
+            else:
+                existing_content = raw_text
+        else:
+            existing_content = "(page not found — create from scratch)"  # placeholder sent to the prompt in place of page text
+        async with semaphore:  # the file read above is not limited by the semaphore
+            page_content = await _llm_call_async(model, [
+                system_msg,
+                doc_msg,
+                {"role": "assistant", "content": summary},
+                {"role": "user", "content": _CONCEPT_UPDATE_USER.format(
+                    title=title, doc_name=doc_name,
+                    existing_content=existing_content,
+                )},
+            ], f"update:{name}")
+        return name, page_content, True  # True: unpacked as is_update when results are written
 
-    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
-    sys.stdout.flush()
+    tasks = []  # coroutine objects only; they start running inside gather() below
+    tasks.extend(_gen_create(c) for c in create_items)
+    tasks.extend(_gen_update(c) for c in update_items)
 
-    results = await asyncio.gather(
-        *[_gen_concept(c) for c in concepts_list],
-        return_exceptions=True,
-    )
+    concept_names: list[str] = []  # names of pages actually written; passed to _update_index
 
-    concept_names = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update)
-        concept_names.append(name)
+    if tasks:  # empty when the plan holds only "related" items
+        total = len(tasks)
+        sys.stdout.write(f"    Generating {total} concept(s) (concurrency={max_concurrency})...\n")
+        sys.stdout.flush()
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)  # failures come back as values, not raised
+
+        for r in results:
+            if isinstance(r, Exception):  # one failed page is logged and skipped; the rest are still written
+                logger.warning("Concept generation failed: %s", r)
+                continue
+            name, page_content, is_update = r
+            _write_concept(wiki_dir, name, page_content, source_file, is_update)
+            concept_names.append(name)
+
+    # --- Step 3b: Process related items (code only, no LLM) ---
+    for slug in related_items:  # NOTE(review): related slugs are not appended to concept_names — confirm intended
+        _add_related_link(wiki_dir, slug, doc_name, source_file)
 
     # --- Step 4: Update index (code only) ---
     _update_index(wiki_dir, doc_name, concept_names)
 
 
-async def compile_long_doc(
+async def compile_short_doc(
     doc_name: str,
-    summary_path: Path,
-    doc_id: str,
+    source_path: Path,
     kb_dir: Path,
     model: str,
     max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
 ) -> None:
-    """Compile a long (PageIndex) document's concepts and index.
+    """Compile a short document using a multi-step LLM pipeline with caching.
 
-    The summary page is already written by the indexer. This function
-    generates concept pages and updates the index.
+    Step 1: Build base context A (schema + doc content), generate summary.
+    Steps 2-4: Delegated to ``_compile_concepts``.
     """
     from openkb.config import load_config
 
@@ -527,84 +544,63 @@ async def compile_long_doc(
     wiki_dir = kb_dir / "wiki"
     schema_md = get_agents_md(wiki_dir)
     source_file = _find_source_filename(doc_name, kb_dir)
-    summary = summary_path.read_text(encoding="utf-8")
+    content = source_path.read_text(encoding="utf-8")  # full document text, embedded in doc_msg below
 
-    # Base context A
+    # Base context A: system + document
     system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
         schema_md=schema_md, language=language,
     )}
-    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
-        doc_name=doc_name, doc_id=doc_id, content=summary,
+    doc_msg = {"role": "user", "content": _SUMMARY_USER.format(
+        doc_name=doc_name, content=content,
     )}
 
-    # --- Step 1: Extract concept list ---
-    _, existing_concepts = _read_wiki_context(wiki_dir)
-
-    # Get a concise overview first (for concept generation context)
-    overview = _llm_call(model, [system_msg, doc_msg], "overview")
+    # --- Step 1: Generate summary ---
+    summary = _llm_call(model, [system_msg, doc_msg], "summary")  # NOTE(review): not awaited — presumably blocks this coroutine; confirm
+    _write_summary(wiki_dir, doc_name, source_file, summary)  # summary page is written before concept planning starts
 
-    concepts_list_raw = _llm_call(model, [
-        system_msg,
-        doc_msg,
-        {"role": "assistant", "content": overview},
-        {"role": "user", "content": _CONCEPTS_LIST_USER.format(
-            existing_concepts=", ".join(existing_concepts) if existing_concepts else "(none yet)",
-        )},
-    ], "concepts-list", max_tokens=512)
+    # --- Steps 2-4: Concept plan → generate/update → index ---
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        summary, doc_name, max_concurrency,
+    )
 
-    try:
-        concepts_list = _parse_json(concepts_list_raw)
-    except (json.JSONDecodeError, ValueError) as exc:
-        logger.warning("Failed to parse concepts list: %s", exc)
-        logger.debug("Raw: %s", concepts_list_raw)
-        _update_index(wiki_dir, doc_name, [])
-        return
 
-    if not concepts_list:
-        _update_index(wiki_dir, doc_name, [])
-        return
+async def compile_long_doc(
+    doc_name: str,
+    summary_path: Path,
+    doc_id: str,
+    kb_dir: Path,
+    model: str,
+    max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY,
+) -> None:
+    """Compile a long (PageIndex) document's concepts and index.
 
-    # --- Step 2: Generate concept pages concurrently ---
-    semaphore = asyncio.Semaphore(max_concurrency)
+    The summary page is already written by the indexer. This function
+    generates concept pages and updates the index.
+    """
+    from openkb.config import load_config
 
-    async def _gen_concept(concept: dict) -> tuple[str, str, bool]:
-        name = concept["name"]
-        title = concept.get("title", name)
-        is_update = concept.get("is_update", False)
-        update_instruction = (
-            "This concept page already exists. Add new information."
-            if is_update else ""
-        )
+    openkb_dir = kb_dir / ".openkb"
+    config = load_config(openkb_dir / "config.yaml")
+    language: str = config.get("language", "en")
 
-        async with semaphore:
-            page_content = await _llm_call_async(model, [
-                system_msg,
-                doc_msg,
-                {"role": "assistant", "content": overview},
-                {"role": "user", "content": _CONCEPT_PAGE_USER.format(
-                    title=title, doc_name=doc_name,
-                    update_instruction=update_instruction,
-                )},
-            ], f"concept:{name}")
+    wiki_dir = kb_dir / "wiki"
+    schema_md = get_agents_md(wiki_dir)
+    summary_content = summary_path.read_text(encoding="utf-8")  # existing summary file; only read in this function
 
-        return name, page_content, is_update
+    # Base context A
+    system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format(
+        schema_md=schema_md, language=language,
+    )}
+    doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format(
+        doc_name=doc_name, doc_id=doc_id, content=summary_content,
+    )}
 
-    sys.stdout.write(f"    Generating {len(concepts_list)} concept(s) (concurrency={max_concurrency})...\n")
-    sys.stdout.flush()
+    # --- Step 1: Generate overview ---
+    overview = _llm_call(model, [system_msg, doc_msg], "overview")  # NOTE(review): not awaited — presumably blocks this coroutine; confirm
 
-    results = await asyncio.gather(
-        *[_gen_concept(c) for c in concepts_list],
-        return_exceptions=True,
+    # --- Steps 2-4: Concept plan → generate/update → index ---
+    await _compile_concepts(
+        wiki_dir, kb_dir, model, system_msg, doc_msg,
+        overview, doc_name, max_concurrency,  # overview is passed in the position of the summary parameter
     )
-
-    concept_names = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning("Concept generation failed: %s", r)
-            continue
-        name, page_content, is_update = r
-        _write_concept(wiki_dir, name, page_content, source_file, is_update)
-        concept_names.append(name)
-
-    # --- Step 3: Update index (code only) ---
-    _update_index(wiki_dir, doc_name, concept_names)