Skip to content

Commit 0a941fd

Browse files
Merge pull request #947 from tiran/scan-compiled
feat: scan source distributions for compiled code
2 parents cb016ce + 4535ff9 commit 0a941fd

2 files changed

Lines changed: 138 additions & 1 deletion

File tree

src/fromager/sources.py

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ def default_prepare_source(
543543
source_filename: pathlib.Path,
544544
version: Version,
545545
) -> tuple[pathlib.Path, bool]:
546-
"""Unpack and modify sdist sources
546+
"""Unpack, modify, and check sdist sources
547547
548548
Calls :func:`~fromager.sources.prepare_new_source` by default.
549549
"""
@@ -560,6 +560,15 @@ def default_prepare_source(
560560
source_root_dir=source_root_dir,
561561
version=version,
562562
)
563+
564+
# look for compiled code in sdist and warn the user
565+
potential_issues = scan_compiled_extensions(source_root_dir)
566+
if potential_issues:
567+
logger.warning(
568+
"scan_compiled_extensions has detected potential issues in %s",
569+
", ".join(str(f) for f in sorted(potential_issues)),
570+
)
571+
563572
return source_root_dir, is_new
564573

565574

@@ -776,3 +785,94 @@ def validate_sdist_filename(
776785
dist_name=sdist_name,
777786
dist_version=sdist_version,
778787
)
788+
789+
790+
# File suffixes of compiled artifacts. A file with one of these suffixes is
# always reported, regardless of its content.
_EXTENSION_SUFFIXES: set[str] = {
    # shared libraries and Python extension modules
    ".so",  # Linux, BSD
    ".dylib",  # macOS
    ".pyd",  # Windows
    ".dll",  # Windows
    # executables
    ".exe",  # Windows
}

# File suffixes that are skipped without looking at the file content.
_IGNORE_SUFFIXES: set[str] = {
    # C, C++, CUDA, ROCm/hip
    ".c",
    ".cc",
    ".cu",
    ".cuh",
    ".h",
    ".hip",
    ".hpp",
    # Go, JavaScript, Python, Rust, shell, TypeScript
    ".go",
    ".js",
    ".py",
    ".rs",
    ".sh",
    ".ts",
    # style sheets and text files (HTML, Markdown, reStructuredText, plain)
    ".css",
    ".html",
    ".md",
    ".rst",
    ".txt",
    # configs
    ".ini",
    ".toml",
    ".yaml",
    ".yml",
}

# Byte sequences found at the very start of compiled files.
_MAGIC_HEADERS: tuple[bytes, ...] = (
    # ELF (Linux, BSD): binaries and object files
    b"\x7fELF",
    # ar archives (static libraries), regular and GCC thin variant
    b"!<arch>\n",
    b"!<thin>\n",
    # macOS Mach-O: 64-bit and 32-bit
    b"\xfe\xed\xfa\xcf",
    b"\xfe\xed\xfa\xce",
    # macOS Mach-O, little-endian: 64-bit and 32-bit
    b"\xcf\xfa\xed\xfe",
    b"\xce\xfa\xed\xfe",
    # macOS universal binary
    b"\xca\xfe\xba\xbe",
    # Windows executable (usually have dll, pyd, or exe file suffix)
    b"MZ",
)
# Number of leading bytes needed to compare against the longest header.
_MAGIC_HEADERS_READ: int = max(map(len, _MAGIC_HEADERS))
837+
838+
839+
def scan_compiled_extensions(
    root_dir: pathlib.Path,
    *,
    extension_suffixes: set[str] = _EXTENSION_SUFFIXES,
    ignore_suffixes: set[str] = _IGNORE_SUFFIXES,
    magic_headers: tuple[bytes, ...] = _MAGIC_HEADERS,
) -> list[pathlib.Path]:
    """Scan directory tree for compiled code

    Detect files that have an extension suffix or magic header.

    :param root_dir: root of the directory tree to walk
    :param extension_suffixes: file suffixes that are always reported,
        without looking at the file content
    :param ignore_suffixes: file suffixes that are skipped without looking
        at the file content
    :param magic_headers: byte sequences; a file whose content starts with
        one of them is reported
    :returns: a list of files with potential issues. The paths are relative
        to *root_dir*.

    .. warning::

       The function is not designed to detect supply chain attacks or
       malicious code. It's merely a helper to detect packaging issues.
    """
    # Size the read from the headers actually in use, so that caller-supplied
    # headers longer than the module defaults can still match.
    header_size = max((len(header) for header in magic_headers), default=0)
    issues: list[pathlib.Path] = []
    for directory, _, filenames in root_dir.walk():
        for filename in filenames:
            filepath = directory / filename
            suffix = filepath.suffix
            if suffix in extension_suffixes:
                relpath = filepath.relative_to(root_dir)
                logger.debug("file %s has a binary extension suffix", relpath)
                issues.append(relpath)
                continue
            if suffix in ignore_suffixes or not header_size:
                # ignored suffix, or no magic headers to compare against
                continue
            # Path.walk() does not follow symlinks by default and lists them
            # in *filenames*, including links to directories and dangling
            # links. Opening those would raise, so only inspect regular files.
            if not filepath.is_file():
                continue
            with filepath.open("rb") as f:
                header = f.read(header_size)
            if header.startswith(magic_headers):
                relpath = filepath.relative_to(root_dir)
                logger.debug(
                    "file %s starts with an executable file magic header: %r",
                    relpath,
                    header,
                )
                issues.append(relpath)
    return issues

tests/test_sources.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pathlib
2+
import sys
23
import typing
34
from unittest.mock import Mock, patch
45

@@ -275,3 +276,39 @@ def test_validate_sdist_file(
275276
else:
276277
with pytest.raises(ValueError):
277278
sources.validate_sdist_filename(req, version, sdist_file)
279+
280+
281+
# Sample the first bytes of the running Python executable. The parametrized
# test below expects this header to be recognised as compiled code.
with open(sys.executable, "rb") as _f:
    _EXEC_HEADER = _f.read(8)


@pytest.mark.parametrize(
    "filename,content,hit",
    [
        ("test.py", b"#!/usr/bin/python", False),
        ("test.so", b"ignore", True),
        ("test", _EXEC_HEADER, True),
        # assume that packages do not disguise compiled code as .py files.
        # A malicious actor can use more elaborate tricks to hide bad code.
        ("test.py", _EXEC_HEADER, False),
        # ar archive (static library)
        ("libfoo.a", b"!<arch>\n", True),
        # thin ar archive
        ("libfoo.a", b"!<thin>\n", True),
        # Mach-O little-endian
        ("test", b"\xcf\xfa\xed\xfe", True),
        ("test", b"\xce\xfa\xed\xfe", True),
    ],
)
def test_scan_compiled_extensions(
    filename: str, content: bytes, hit: bool, tmp_path: pathlib.Path
) -> None:
    """Write one file into *tmp_path* and check whether the scan reports it."""
    (tmp_path / filename).write_bytes(content)
    # A hit is reported as a path relative to the scanned root directory.
    expected = [pathlib.Path(filename)] if hit else []
    assert sources.scan_compiled_extensions(tmp_path) == expected

0 commit comments

Comments
 (0)