Skip to content

Commit 55f6dea

Browse files
committed
fix: preserve non-ASCII characters in concept name slugs
1 parent cf38295 commit 55f6dea

2 files changed

Lines changed: 37 additions & 1 deletion

File tree

openkb/agent/compiler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import sys
1717
import threading
1818
import time
19+
import unicodedata
1920
from pathlib import Path
2021

2122
import litellm
@@ -302,11 +303,12 @@ def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
302303
(summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8")
303304

304305

305-
_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]')
306+
_SAFE_NAME_RE = re.compile(r'[^\w\-]')
306307

307308

308309
def _sanitize_concept_name(name: str) -> str:
309310
"""Sanitize a concept name for safe use as a filename."""
311+
name = unicodedata.normalize("NFKC", name)
310312
sanitized = _SAFE_NAME_RE.sub("-", name).strip("-")
311313
return sanitized or "unnamed-concept"
312314

tests/test_compiler.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
compile_short_doc,
1313
_compile_concepts,
1414
_parse_json,
15+
_sanitize_concept_name,
1516
_write_summary,
1617
_write_concept,
1718
_update_index,
@@ -74,6 +75,39 @@ def test_plain_text_fallback(self):
7475
_parse_json("Just plain markdown text without JSON")
7576

7677

78+
class TestSanitizeConceptName:
    """Tests for the filename slugs produced by ``_sanitize_concept_name``."""

    def test_ascii_passthrough(self):
        # An already-safe ASCII slug must come back unchanged.
        assert _sanitize_concept_name("hello-world") == "hello-world"

    def test_spaces_replaced(self):
        # Whitespace is not filename-safe and becomes a hyphen.
        assert _sanitize_concept_name("hello world") == "hello-world"

    def test_chinese(self):
        result = _sanitize_concept_name("注意力机制")
        assert result == "注意力机制"

    def test_japanese(self):
        result = _sanitize_concept_name("トランスフォーマー")
        assert result == "トランスフォーマー"

    def test_french_accents(self):
        result = _sanitize_concept_name("réseau neuronal")
        # Pin the exact slug: the accented letter is preserved (in composed
        # form) and only the space is replaced.  A looser check such as
        # ``"r" in result`` would pass for almost any output.
        assert result == "r\u00e9seau-neuronal"

    def test_distinct_chinese_names_no_collision(self):
        # Distinct non-ASCII names must not collapse into the same slug.
        a = _sanitize_concept_name("注意力机制")
        b = _sanitize_concept_name("变压器模型")
        assert a != b

    def test_empty_fallback(self):
        # Nothing survives sanitizing, so the placeholder name is returned.
        assert _sanitize_concept_name("!!!") == "unnamed-concept"

    def test_nfkc_normalization(self):
        # U+FF21 / U+FF22 (fullwidth "A" / "B") should normalize to ASCII "AB".
        assert _sanitize_concept_name("\uff21\uff22") == "AB"
109+
110+
77111
class TestWriteSummary:
78112
def test_writes_with_frontmatter(self, tmp_path):
79113
wiki = tmp_path / "wiki"

0 commit comments

Comments
 (0)