Skip to content

Commit a496d04

Browse files
committed
merge: resolve conflicts with origin/dev
Accept all origin/dev changes including: image support in query agent, robust JSON parsing with json_repair, unicode concept name support, section-based index operations, cloud/local page extraction fallback.
2 parents 634b212 + b77e95d commit a496d04

18 files changed

Lines changed: 477 additions & 171 deletions

openkb/agent/compiler.py

Lines changed: 121 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import sys
1717
import threading
1818
import time
19+
import unicodedata
1920
from pathlib import Path
2021

2122
import litellm
@@ -215,14 +216,18 @@ async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> s
215216

216217

217218
def _parse_json(text: str) -> list | dict:
218-
"""Parse JSON from LLM response, stripping markdown fences if present."""
219+
"""Parse JSON from LLM response, handling fences, prose, and malformed JSON."""
220+
from json_repair import repair_json
219221
cleaned = text.strip()
220222
if cleaned.startswith("```"):
221-
first_nl = cleaned.index("\n")
222-
cleaned = cleaned[first_nl + 1:]
223+
first_nl = cleaned.find("\n")
224+
cleaned = cleaned[first_nl + 1:] if first_nl != -1 else cleaned[3:]
223225
if cleaned.endswith("```"):
224226
cleaned = cleaned[:-3]
225-
return json.loads(cleaned.strip())
227+
result = json.loads(repair_json(cleaned.strip()))
228+
if not isinstance(result, (dict, list)):
229+
raise ValueError(f"Expected JSON object or array, got {type(result).__name__}")
230+
return result
226231

227232

228233
# ---------------------------------------------------------------------------
@@ -279,39 +284,82 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
279284
return "\n".join(lines) or "(none yet)"
280285

281286

282-
def _find_source_filename(doc_name: str, kb_dir: Path) -> str:
283-
"""Find the original filename in raw/ for a given doc stem."""
284-
raw_dir = kb_dir / "raw"
285-
if raw_dir.exists():
286-
for f in raw_dir.iterdir():
287-
if f.stem == doc_name:
288-
return f.name
289-
return f"{doc_name}.pdf"
290-
291-
292-
def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str,
293-
brief: str = "", doc_type: str = "short") -> None:
294-
"""Write summary page with frontmatter.
295-
296-
For short docs, includes a ``source_doc`` field linking to the full
297-
source text in ``sources/{doc_name}.md``.
298-
"""
287+
def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None:
288+
"""Return the [start, end) bounds for a Markdown H2 section."""
289+
for i, line in enumerate(lines):
290+
if line == heading:
291+
start = i + 1
292+
end = len(lines)
293+
for j in range(start, len(lines)):
294+
if lines[j].startswith("## "):
295+
end = j
296+
break
297+
return start, end
298+
return None
299+
300+
301+
def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
302+
"""Check whether an index entry already exists inside the named section."""
303+
bounds = _get_section_bounds(lines, heading)
304+
if bounds is None:
305+
return False
306+
307+
start, end = bounds
308+
entry_prefix = f"- {link}"
309+
return any(line.startswith(entry_prefix) for line in lines[start:end])
310+
311+
312+
def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str) -> bool:
313+
"""Replace the first matching entry within a specific section."""
314+
bounds = _get_section_bounds(lines, heading)
315+
if bounds is None:
316+
return False
317+
318+
start, end = bounds
319+
entry_prefix = f"- {link}"
320+
for i in range(start, end):
321+
if lines[i].startswith(entry_prefix):
322+
lines[i] = entry
323+
return True
324+
return False
325+
326+
327+
def _insert_section_entry(lines: list[str], heading: str, entry: str) -> bool:
328+
"""Insert a new entry at the top of a specific section."""
329+
bounds = _get_section_bounds(lines, heading)
330+
if bounds is None:
331+
return False
332+
333+
start, _ = bounds
334+
lines.insert(start, entry)
335+
return True
336+
337+
338+
339+
def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
340+
doc_type: str = "short") -> None:
341+
"""Write summary page with frontmatter."""
342+
if summary.startswith("---"):
343+
end = summary.find("---", 3)
344+
if end != -1:
345+
summary = summary[end + 3:].lstrip("\n")
299346
summaries_dir = wiki_dir / "summaries"
300347
summaries_dir.mkdir(parents=True, exist_ok=True)
301-
fm_lines = [f"sources: [{source_file}]"]
302-
if brief:
303-
fm_lines.append(f"brief: {brief}")
304-
if doc_type == "short":
305-
fm_lines.append(f"source_doc: sources/{doc_name}.md")
348+
ext = "md" if doc_type == "short" else "json"
349+
fm_lines = [
350+
f"doc_type: {doc_type}",
351+
f"full_text: sources/{doc_name}.{ext}",
352+
]
306353
frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
307354
(summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
308355

309356

310-
_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]')
357+
_SAFE_NAME_RE = re.compile(r'[^\w\-]')
311358

312359

313360
def _sanitize_concept_name(name: str) -> str:
314361
"""Sanitize a concept name for safe use as a filename."""
362+
name = unicodedata.normalize("NFKC", name)
315363
sanitized = _SAFE_NAME_RE.sub("-", name).strip("-")
316364
return sanitized or "unnamed-concept"
317365

@@ -341,7 +389,21 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
341389
existing = fm + body
342390
else:
343391
existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
344-
existing += f"\n\n{content}"
392+
# Strip frontmatter from LLM content to avoid duplicate blocks
393+
clean = content
394+
if clean.startswith("---"):
395+
end = clean.find("---", 3)
396+
if end != -1:
397+
clean = clean[end + 3:].lstrip("\n")
398+
# Replace body with LLM rewrite (prompt asks for full rewrite, not delta)
399+
if existing.startswith("---"):
400+
end = existing.find("---", 3)
401+
if end != -1:
402+
existing = existing[:end + 3] + "\n\n" + clean
403+
else:
404+
existing = clean
405+
else:
406+
existing = clean
345407
if brief and existing.startswith("---"):
346408
end = existing.find("---", 3)
347409
if end != -1:
@@ -354,6 +416,10 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
354416
existing = fm + body
355417
path.write_text(existing, encoding="utf-8")
356418
else:
419+
if content.startswith("---"):
420+
end = content.find("---", 3)
421+
if end != -1:
422+
content = content[end + 3:].lstrip("\n")
357423
fm_lines = [f"sources: [{source_file}]"]
358424
if brief:
359425
fm_lines.append(f"brief: {brief}")
@@ -445,7 +511,6 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str])
445511
text += f"\n\n## Related Documents\n- {link}\n"
446512
path.write_text(text, encoding="utf-8")
447513

448-
449514
def _update_index(
450515
wiki_dir: Path, doc_name: str, concept_names: list[str],
451516
doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
@@ -454,8 +519,9 @@ def _update_index(
454519
"""Append document and concept entries to index.md.
455520
456521
When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries
457-
are written as ``- [[link]] (type) — brief text``. Existing entries are
458-
detected by the link part only, so updating a brief on a re-compile works.
522+
are written as ``- [[link]] (type) — brief text``. Existing entries are
523+
detected within their own section by exact entry prefix and skipped to
524+
avoid duplicates.
459525
``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the
460526
query agent knows how to access detailed content.
461527
"""
@@ -469,26 +535,27 @@ def _update_index(
469535
encoding="utf-8",
470536
)
471537

472-
text = index_path.read_text(encoding="utf-8")
538+
lines = index_path.read_text(encoding="utf-8").split("\n")
473539

474540
doc_link = f"[[summaries/{doc_name}]]"
475-
if doc_link not in text:
541+
if not _section_contains_link(lines, "## Documents", doc_link):
476542
doc_entry = f"- {doc_link} ({doc_type})"
477543
if doc_brief:
478544
doc_entry += f" — {doc_brief}"
479-
if "## Documents" in text:
480-
text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1)
545+
_insert_section_entry(lines, "## Documents", doc_entry)
481546

482547
for name in concept_names:
483548
concept_link = f"[[concepts/{name}]]"
484-
if concept_link not in text:
485-
concept_entry = f"- {concept_link}"
549+
concept_entry = f"- {concept_link}"
550+
if name in concept_briefs:
551+
concept_entry += f" — {concept_briefs[name]}"
552+
if _section_contains_link(lines, "## Concepts", concept_link):
486553
if name in concept_briefs:
487-
concept_entry += f" — {concept_briefs[name]}"
488-
if "## Concepts" in text:
489-
text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1)
554+
_replace_section_entry(lines, "## Concepts", concept_link, concept_entry)
555+
else:
556+
_insert_section_entry(lines, "## Concepts", concept_entry)
490557

491-
index_path.write_text(text, encoding="utf-8")
558+
index_path.write_text("\n".join(lines), encoding="utf-8")
492559

493560

494561
# ---------------------------------------------------------------------------
@@ -515,7 +582,7 @@ async def _compile_concepts(
515582
Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related
516583
actions, then executes each action type accordingly.
517584
"""
518-
source_file = _find_source_filename(doc_name, kb_dir)
585+
source_file = f"summaries/{doc_name}.md"
519586

520587
# --- Step 2: Get concepts plan (A cached) ---
521588
concept_briefs = _read_concept_briefs(wiki_dir)
@@ -534,7 +601,7 @@ async def _compile_concepts(
534601
except (json.JSONDecodeError, ValueError) as exc:
535602
logger.warning("Failed to parse concepts plan: %s", exc)
536603
logger.debug("Raw: %s", plan_raw)
537-
_update_index(wiki_dir, doc_name, [])
604+
_update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type)
538605
return
539606

540607
# Fallback: if LLM returns a flat list, treat all items as "create"
@@ -552,7 +619,7 @@ async def _compile_concepts(
552619
related_items = plan["related"]
553620

554621
if not create_items and not update_items and not related_items:
555-
_update_index(wiki_dir, doc_name, [])
622+
_update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type)
556623
return
557624

558625
# --- Step 3: Generate/update concept pages concurrently (A cached) ---
@@ -570,7 +637,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
570637
title=title, doc_name=doc_name,
571638
update_instruction="",
572639
)},
573-
], f"concept:{name}")
640+
], f"concept: {name}")
574641
try:
575642
parsed = _parse_json(raw)
576643
brief = parsed.get("brief", "")
@@ -582,7 +649,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
582649
async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
583650
name = concept["name"]
584651
title = concept.get("title", name)
585-
concept_path = wiki_dir / "concepts" / f"{name}.md"
652+
concept_path = wiki_dir / "concepts" / f"{_sanitize_concept_name(name)}.md"
586653
if concept_path.exists():
587654
raw_text = concept_path.read_text(encoding="utf-8")
588655
if raw_text.startswith("---"):
@@ -601,7 +668,7 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
601668
title=title, doc_name=doc_name,
602669
existing_content=existing_content,
603670
)},
604-
], f"update:{name}")
671+
], f"update: {name}")
605672
try:
606673
parsed = _parse_json(raw)
607674
brief = parsed.get("brief", "")
@@ -630,16 +697,18 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
630697
continue
631698
name, page_content, is_update, brief = r
632699
_write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief)
633-
concept_names.append(name)
700+
safe_name = _sanitize_concept_name(name)
701+
concept_names.append(safe_name)
634702
if brief:
635-
concept_briefs_map[name] = brief
703+
concept_briefs_map[safe_name] = brief
636704

637705
# --- Step 3b: Process related items (code only, no LLM) ---
638-
for slug in related_items:
706+
sanitized_related = [_sanitize_concept_name(s) for s in related_items]
707+
for slug in sanitized_related:
639708
_add_related_link(wiki_dir, slug, doc_name, source_file)
640709

641710
# --- Step 3c: Backlink — summary ↔ concepts (code only) ---
642-
all_concept_slugs = concept_names + [s for s in related_items]
711+
all_concept_slugs = concept_names + sanitized_related
643712
if all_concept_slugs:
644713
_backlink_summary(wiki_dir, doc_name, all_concept_slugs)
645714
_backlink_concepts(wiki_dir, doc_name, all_concept_slugs)
@@ -670,7 +739,6 @@ async def compile_short_doc(
670739

671740
wiki_dir = kb_dir / "wiki"
672741
schema_md = get_agents_md(wiki_dir)
673-
source_file = _find_source_filename(doc_name, kb_dir)
674742
content = source_path.read_text(encoding="utf-8")
675743

676744
# Base context A: system + document
@@ -690,7 +758,7 @@ async def compile_short_doc(
690758
except (json.JSONDecodeError, ValueError):
691759
doc_brief = ""
692760
summary = summary_raw
693-
_write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief)
761+
_write_summary(wiki_dir, doc_name, summary)
694762

695763
# --- Steps 2-4: Concept plan → generate/update → index ---
696764
await _compile_concepts(

openkb/agent/linter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from agents import Agent, Runner, function_tool
77

88
from openkb.agent.tools import list_wiki_files, read_wiki_file
9+
10+
MAX_TURNS = 50
911
from openkb.schema import SCHEMA_MD, get_agents_md
1012

1113
_LINTER_INSTRUCTIONS_TEMPLATE = """\
@@ -102,5 +104,5 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str:
102104
"Produce a structured Markdown report."
103105
)
104106

105-
result = await Runner.run(agent, prompt)
107+
result = await Runner.run(agent, prompt, max_turns=MAX_TURNS)
106108
return result.final_output or "Knowledge lint completed. No output produced."

0 commit comments

Comments
 (0)