@@ -543,7 +543,7 @@ def default_prepare_source(
543543 source_filename : pathlib .Path ,
544544 version : Version ,
545545) -> tuple [pathlib .Path , bool ]:
546- """Unpack and modify sdist sources
546+ """Unpack, modify, and check sdist sources
547547
548548 Calls :func:`~fromager.sources.prepare_new_source` by default.
549549 """
@@ -560,6 +560,15 @@ def default_prepare_source(
560560 source_root_dir = source_root_dir ,
561561 version = version ,
562562 )
563+
564+ # look for compiled code in sdist and warn the user
565+ potential_issues = scan_compiled_extensions (source_root_dir )
566+ if potential_issues :
567+ logger .warning (
568+ "scan_compiled_extensions has detected potential issues in %s" ,
569+ ", " .join (str (f ) for f in sorted (potential_issues )),
570+ )
571+
563572 return source_root_dir , is_new
564573
565574
@@ -776,3 +785,94 @@ def validate_sdist_filename(
776785 dist_name = sdist_name ,
777786 dist_version = sdist_version ,
778787 )
788+
789+
790+ _EXTENSION_SUFFIXES : set [str ] = {
791+ ".so" , # Linux, BSD
792+ ".dylib" , # macOS
793+ ".pyd" , # Windows
794+ ".dll" , # Windows
795+ ".exe" , # Windows
796+ }
797+
798+ # ignore Python, configs, C, C++, CUDA, Go, JavaScript, ROCm/hip, Rust,
799+ # text files (Markdown, restructured text, HTML), TypeScripts
800+ _IGNORE_SUFFIXES : set [str ] = {
801+ ".c" ,
802+ ".cc" ,
803+ ".css" ,
804+ ".cu" ,
805+ ".cuh" ,
806+ ".go" ,
807+ ".h" ,
808+ ".hip" ,
809+ ".hpp" ,
810+ ".html" ,
811+ ".ini" ,
812+ ".js" ,
813+ ".md" ,
814+ ".py" ,
815+ ".rs" ,
816+ ".rst" ,
817+ ".sh" ,
818+ ".ts" ,
819+ ".toml" ,
820+ ".txt" ,
821+ ".yaml" ,
822+ ".yml" ,
823+ }
824+
825+ _MAGIC_HEADERS : tuple [bytes , ...] = (
826+ b"\x7f ELF" , # Linux, BSD ELF file (binaries, object files)
827+ b"!<arch>\n " , # ar archive (static libraries)
828+ b"!<thin>\n " , # GCC thin ar archive
829+ b"\xfe \xed \xfa \xcf " , # macOS Mach-O 64-bit
830+ b"\xfe \xed \xfa \xce " , # macOS Mach-O 32-bit
831+ b"\xcf \xfa \xed \xfe " , # macOS Mach-O 64-bit (little-endian)
832+ b"\xce \xfa \xed \xfe " , # macOS Mach-O 32-bit (little-endian)
833+ b"\xca \xfe \xba \xbe " , # macOS universal binary
834+ b"MZ" , # Windows executable (usually have dll, pyd, or exe file suffix)
835+ )
836+ _MAGIC_HEADERS_READ : int = max (len (header ) for header in _MAGIC_HEADERS )
837+
838+
def scan_compiled_extensions(
    root_dir: pathlib.Path,
    *,
    extension_suffixes: set[str] = _EXTENSION_SUFFIXES,
    ignore_suffixes: set[str] = _IGNORE_SUFFIXES,
    magic_headers: tuple[bytes, ...] = _MAGIC_HEADERS,
) -> list[pathlib.Path]:
    """Scan directory tree for compiled code

    Detect files that have an extension suffix or magic header.

    A file whose suffix is in *extension_suffixes* is reported without
    being opened. A file whose suffix is in *ignore_suffixes* is skipped.
    Every other regular file is opened and reported when its first bytes
    start with one of *magic_headers*. Entries that are not regular files
    (dangling symlinks, symlinks to directories, FIFOs, ...) are never
    opened.

    Returns a list of files with potential issues. The paths are relative
    to *root_dir*. The list is in directory walk order, which is not
    sorted.

    .. warning::

       The function is not designed to detect supply chain attacks or
       malicious code. It's merely a helper to detect packaging issues.
    """
    # Size the read from the headers actually in use. A fixed module-level
    # length would silently truncate longer, caller-supplied headers.
    header_size = max((len(header) for header in magic_headers), default=0)
    issues: list[pathlib.Path] = []
    for directory, _, filenames in root_dir.walk():
        for filename in filenames:
            filepath = directory / filename
            suffix = filepath.suffix
            if suffix in extension_suffixes:
                relpath = filepath.relative_to(root_dir)
                logger.debug("file %s has a binary extension suffix", relpath)
                issues.append(relpath)
                continue
            if suffix in ignore_suffixes or not magic_headers:
                continue
            # Path.walk() lists symlinks to directories under filenames
            # when symlinks are not followed. Opening those, dangling
            # symlinks, or special files would raise or block.
            if not filepath.is_file():
                continue
            with filepath.open("rb") as f:
                header = f.read(header_size)
            if header.startswith(magic_headers):
                relpath = filepath.relative_to(root_dir)
                logger.debug(
                    "file %s starts with an executable file magic header: %r",
                    relpath,
                    header,
                )
                issues.append(relpath)
    return issues
0 commit comments