Skip to content

Commit 1a28c11

Browse files
committed
fix: code review fixes — security, robustness, tests, and CI hardening
- Restore markitdown[all] extras for docx/pptx/xlsx support - Sanitize concept names to prevent path traversal in compiler - Add path traversal guard in copy_relative_images - Fix _write_concept duplicate append when frontmatter lacks sources key - Remove dead write_wiki_files function - Fix watcher thread race in _schedule_flush - Warn when unimplemented --fix flag is used in lint command - Harden CI publish workflow with environment gate and SHA-pinned actions - Fix test_indexer to actually assert IndexConfig flag values - Fix test_converter to test correct PDF code path (pymupdf, not markitdown) - Use str.find() instead of str.index() in frontmatter parsing to avoid ValueError
1 parent 4249d53 commit 1a28c11

9 files changed

Lines changed: 46 additions & 77 deletions

File tree

.github/workflows/publish.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@ on:
88
jobs:
99
publish:
1010
runs-on: ubuntu-latest
11+
environment: pypi
1112
permissions:
1213
id-token: write
1314
steps:
14-
- uses: actions/checkout@v4
15+
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.2.2
1516

16-
- uses: actions/setup-python@v5
17+
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
1718
with:
1819
python-version: "3.12"
1920

@@ -24,4 +25,4 @@ jobs:
2425
run: python -m build
2526

2627
- name: Publish to PyPI
27-
uses: pypa/gh-action-pypi-publish@release/v1
28+
uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # release/v1.11.0

openkb/agent/compiler.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -302,14 +302,15 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
302302
existing = path.read_text(encoding="utf-8")
303303
if source_file not in existing:
304304
if existing.startswith("---"):
305-
end = existing.index("---", 3)
306-
fm = existing[:end + 3]
307-
body = existing[end + 3:]
308-
if "sources:" in fm:
309-
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
310-
else:
311-
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
312-
existing = fm + body
305+
end = existing.find("---", 3)
306+
if end != -1:
307+
fm = existing[:end + 3]
308+
body = existing[end + 3:]
309+
if "sources:" in fm:
310+
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
311+
else:
312+
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
313+
existing = fm + body
313314
else:
314315
existing = f"---\nsources: [{source_file}]\n---\n\n" + existing
315316
existing += f"\n\n{content}"
@@ -334,14 +335,15 @@ def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_f
334335
# Update sources in frontmatter
335336
if source_file not in text:
336337
if text.startswith("---"):
337-
end = text.index("---", 3)
338-
fm = text[:end + 3]
339-
body = text[end + 3:]
340-
if "sources:" in fm:
341-
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
342-
else:
343-
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
344-
text = fm + body
338+
end = text.find("---", 3)
339+
if end != -1:
340+
fm = text[:end + 3]
341+
body = text[end + 3:]
342+
if "sources:" in fm:
343+
fm = fm.replace("sources: [", f"sources: [{source_file}, ")
344+
else:
345+
fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1)
346+
text = fm + body
345347
else:
346348
text = f"---\nsources: [{source_file}]\n---\n\n" + text
347349

openkb/agent/tools.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -73,41 +73,3 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str:
7373
full_path.write_text(content, encoding="utf-8")
7474
return f"Written: {path}"
7575

76-
77-
def write_wiki_files(files_json: str, wiki_root: str) -> str:
78-
"""Write multiple Markdown files to the wiki in one call.
79-
80-
Args:
81-
files_json: JSON array of objects, each with ``"path"`` and ``"content"`` keys.
82-
Example: ``[{"path": "concepts/foo.md", "content": "# Foo\\n..."}]``
83-
wiki_root: Absolute path to the wiki root directory.
84-
85-
Returns:
86-
Summary of written files, or error message on failure.
87-
"""
88-
import json
89-
90-
try:
91-
files = json.loads(files_json)
92-
except json.JSONDecodeError as exc:
93-
return f"Invalid JSON: {exc}"
94-
95-
if not isinstance(files, list):
96-
return "Expected a JSON array of {path, content} objects."
97-
98-
root = Path(wiki_root).resolve()
99-
written: list[str] = []
100-
for entry in files:
101-
path = entry.get("path", "")
102-
content = entry.get("content", "")
103-
if not path:
104-
continue
105-
full_path = (root / path).resolve()
106-
if not full_path.is_relative_to(root):
107-
written.append(f"Skipped (path escape): {path}")
108-
continue
109-
full_path.parent.mkdir(parents=True, exist_ok=True)
110-
full_path.write_text(content, encoding="utf-8")
111-
written.append(path)
112-
113-
return f"Written {len(written)} files: {', '.join(written)}"

openkb/cli.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,9 +334,11 @@ def on_new_files(paths):
334334

335335

336336
@cli.command()
337-
@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues.") # TODO: --fix not yet implemented
337+
@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).")
338338
def lint(fix):
339339
"""Lint the knowledge base for structural and semantic inconsistencies."""
340+
if fix:
341+
click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.")
340342
kb_dir = _find_kb_dir()
341343
if kb_dir is None:
342344
click.echo("No knowledge base found. Run `openkb init` first.")

openkb/images.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,10 @@ def copy_relative_images(
171171

172172
for match in _RELATIVE_RE.finditer(markdown):
173173
alt, rel_path = match.group(1), match.group(2)
174-
src = source_dir / rel_path
174+
src = (source_dir / rel_path).resolve()
175+
if not src.is_relative_to(source_dir.resolve()):
176+
logger.warning("Image path escapes source dir: %s; skipping.", rel_path)
177+
continue
175178
if not src.exists():
176179
logger.warning(
177180
"Relative image not found: %s; leaving original link.", src

openkb/watcher.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@ def __init__(self, callback: Callable[[list[str]], None], debounce_seconds: floa
3737

3838
def _schedule_flush(self) -> None:
3939
"""Cancel any existing timer and start a fresh debounce timer."""
40-
if self._timer is not None:
41-
self._timer.cancel()
42-
self._timer = threading.Timer(self._debounce_seconds, self._flush)
43-
self._timer.daemon = True
44-
self._timer.start()
40+
with self._lock:
41+
if self._timer is not None:
42+
self._timer.cancel()
43+
self._timer = threading.Timer(self._debounce_seconds, self._flush)
44+
self._timer.daemon = True
45+
self._timer.start()
4546

4647
def _flush(self) -> None:
4748
"""Call the callback with all collected pending paths, then clear."""

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ classifiers = [
2727
keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"]
2828
dependencies = [
2929
"pageindex==0.3.0.dev0",
30-
"markitdown",
30+
"markitdown[all]",
3131
"click>=8.0",
3232
"watchdog>=3.0",
3333
"litellm",

tests/test_converter.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,27 +81,24 @@ def test_md_raw_file_copied(self, kb_dir):
8181

8282

8383
class TestConvertDocumentPdfShort:
84-
def test_short_pdf_converted_via_markitdown(self, kb_dir, tmp_path):
85-
"""PDF under threshold is converted with markitdown."""
84+
def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path):
85+
"""PDF under threshold is converted with pymupdf (convert_pdf_with_images)."""
8686
src = tmp_path / "short.pdf"
8787
src.write_bytes(b"%PDF-1.4 fake content")
8888

89-
fake_result = MagicMock()
90-
fake_result.text_content = "# Short PDF\n\nConverted content."
91-
9289
with (
9390
patch("openkb.converter.pymupdf.open") as mock_mu,
94-
patch("openkb.converter.MarkItDown") as mock_mid_cls,
91+
patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi,
9592
):
9693
fake_doc = MagicMock()
9794
fake_doc.page_count = 5 # below default threshold of 20
9895
fake_doc.__enter__ = MagicMock(return_value=fake_doc)
9996
fake_doc.__exit__ = MagicMock(return_value=False)
10097
mock_mu.return_value = fake_doc
101-
mock_mid_cls.return_value.convert.return_value = fake_result
10298

10399
result = convert_document(src, kb_dir)
104100

101+
mock_cpwi.assert_called_once()
105102
assert result.skipped is False
106103
assert result.is_long_doc is False
107104
assert result.source_path is not None

tests/test_indexer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,11 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
9595
with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls:
9696
index_long_document(pdf_path, kb_dir)
9797

98-
# Verify PageIndexClient was instantiated
98+
# Verify PageIndexClient was instantiated with correct IndexConfig
9999
mock_cls.assert_called_once()
100-
# Check that index_config with correct flags was passed
101100
_, kwargs = mock_cls.call_args
102-
ic = kwargs.get("index_config") or mock_cls.call_args[0][0] if mock_cls.call_args[0] else None
103-
# Either as positional or keyword — either way PageIndexClient was called
104-
assert mock_cls.called
101+
ic = kwargs.get("index_config")
102+
assert ic is not None, "index_config must be passed to PageIndexClient"
103+
assert ic.if_add_node_text is True
104+
assert ic.if_add_node_summary is True
105+
assert ic.if_add_doc_description is True

0 commit comments

Comments
 (0)