|
12 | 12 | compile_short_doc, |
13 | 13 | _compile_concepts, |
14 | 14 | _parse_json, |
| 15 | + _sanitize_concept_name, |
15 | 16 | _write_summary, |
16 | 17 | _write_concept, |
17 | 18 | _update_index, |
@@ -74,6 +75,39 @@ def test_plain_text_fallback(self): |
74 | 75 | _parse_json("Just plain markdown text without JSON") |
75 | 76 |
|
76 | 77 |
|
| 78 | +class TestSanitizeConceptName: |
| 79 | + def test_ascii_passthrough(self): |
| 80 | + assert _sanitize_concept_name("hello-world") == "hello-world" |
| 81 | + |
| 82 | + def test_spaces_replaced(self): |
| 83 | + assert _sanitize_concept_name("hello world") == "hello-world" |
| 84 | + |
| 85 | + def test_chinese(self): |
| 86 | + result = _sanitize_concept_name("注意力机制") |
| 87 | + assert result == "注意力机制" |
| 88 | + |
| 89 | + def test_japanese(self): |
| 90 | + result = _sanitize_concept_name("トランスフォーマー") |
| 91 | + assert result == "トランスフォーマー" |
| 92 | + |
| 93 | + def test_french_accents(self): |
| 94 | + result = _sanitize_concept_name("réseau neuronal") |
| 95 | + assert "r" in result |
| 96 | + assert result != "r-seau-neuronal" # accented chars preserved, not stripped |
| 97 | + |
| 98 | + def test_distinct_chinese_names_no_collision(self): |
| 99 | + a = _sanitize_concept_name("注意力机制") |
| 100 | + b = _sanitize_concept_name("变压器模型") |
| 101 | + assert a != b |
| 102 | + |
| 103 | + def test_empty_fallback(self): |
| 104 | + assert _sanitize_concept_name("!!!") == "unnamed-concept" |
| 105 | + |
| 106 | + def test_nfkc_normalization(self): |
| 107 | + # U+FF21/U+FF22 (fullwidth A and B) should NFKC-normalize to ASCII "AB" |
| 108 | + assert _sanitize_concept_name("\uff21\uff22") == "AB" |
| 109 | + |
| 110 | + |
77 | 111 | class TestWriteSummary: |
78 | 112 | def test_writes_with_frontmatter(self, tmp_path): |
79 | 113 | wiki = tmp_path / "wiki" |
|
0 commit comments