From 0bac8ad95de57fddeb3ebd96407aaef9afe9f30f Mon Sep 17 00:00:00 2001
From: Christian Heimes
Date: Tue, 3 Mar 2026 13:59:16 +0100
Subject: [PATCH] feat: scan source distributions for compiled code

The new helper function `scan_compiled_extensions` scans source
distributions for compiled code. It detects common extensions like `.so`
and `.dylib` as well as files with certain headers.

The function is designed to detect packaging issues like sdists with
pre-compiled code. It is incapable of detecting supply chain attacks and
malicious code.

Signed-off-by: Christian Heimes
---
 src/fromager/sources.py | 98 +++++++++++++++++++++++++++++++++++++++++
 tests/test_sources.py   | 39 ++++++++++++++++
 2 files changed, 137 insertions(+)

diff --git a/src/fromager/sources.py b/src/fromager/sources.py
index 3fc93e93..bcff3b2b 100644
--- a/src/fromager/sources.py
+++ b/src/fromager/sources.py
@@ -607,6 +607,9 @@ def build_sdist(
         sdist_root_dir=sdist_root_dir,
         build_env=build_env,
     )
+    # look for compiled code in sdist
+    scan_compiled_extensions(sdist_root_dir)
+
     if req.url:
         # The default approach to making an sdist is to make a tarball from the
         # source directory, since most of the time we got the source directory
@@ -775,3 +778,98 @@ def validate_sdist_filename(
         dist_name=sdist_name,
         dist_version=sdist_version,
     )
+
+
+_EXTENSION_SUFFIXES: set[str] = {
+    ".so",  # Linux, BSD
+    ".dylib",  # macOS
+    ".pyd",  # Windows
+    ".dll",  # Windows
+    ".exe",  # Windows
+}
+
+# ignore Python, configs, C, C++, CUDA, Go, Rust, text files
+_IGNORE_SUFFIXES: set[str] = {
+    ".c",
+    ".cc",
+    ".cu",
+    ".go",
+    ".h",
+    ".ini",
+    ".md",
+    ".py",
+    ".rs",
+    ".rst",
+    ".sh",
+    ".toml",
+    ".txt",
+    ".yaml",
+}
+
+# Mach-O files store the magic number in the byte order of the target CPU.
+# Binaries for x86_64 and arm64 are little-endian and start with the
+# byte-swapped magic.
+_MAGIC_HEADERS: tuple[bytes, ...] = (
+    b"\x7fELF",  # Linux, BSD ELF
+    b"MZ",  # Windows executable
+    b"\xcf\xfa\xed\xfe",  # macOS 64-bit, little-endian (x86_64, arm64)
+    b"\xce\xfa\xed\xfe",  # macOS 32-bit, little-endian
+    b"\xfe\xed\xfa\xcf",  # macOS 64-bit, big-endian
+    b"\xfe\xed\xfa\xce",  # macOS 32-bit, big-endian
+    b"\xca\xfe\xba\xbe",  # macOS universal (also Java class files)
+)
+
+
+def scan_compiled_extensions(
+    root_dir: pathlib.Path,
+    *,
+    extension_suffixes: set[str] = _EXTENSION_SUFFIXES,
+    ignore_suffixes: set[str] = _IGNORE_SUFFIXES,
+    warn: bool = True,
+) -> list[pathlib.Path]:
+    """Scan directory tree for compiled code
+
+    Detect files that have an extension suffix or magic header. Suffixes
+    are compared case-insensitively, so both suffix sets must contain
+    lower case entries. Files with an ignored suffix and entries that are
+    not regular files (broken symlinks, FIFOs, sockets) are never opened.
+
+    :param root_dir: root of the directory tree to scan
+    :param extension_suffixes: lower case suffixes of compiled files
+    :param ignore_suffixes: lower case suffixes that skip the header check
+    :param warn: log a warning for every detected file
+    :returns: paths of all detected files
+
+    .. warning::
+
+        The function is not designed to detect supply chain attacks or
+        malicious code. It's merely a helper to detect packaging issues.
+    """
+    issues: list[pathlib.Path] = []
+    for directory, _, filenames in root_dir.walk():
+        for filename in filenames:
+            filepath = directory / filename
+            suffix = filepath.suffix.lower()
+            if suffix in extension_suffixes:
+                if warn:
+                    logger.warning(
+                        "file %s has a binary extension suffix",
+                        filepath.relative_to(root_dir),
+                    )
+                issues.append(filepath)
+                continue
+            # is_file() is false for broken symlinks and special files;
+            # opening them would fail or block
+            if suffix in ignore_suffixes or not filepath.is_file():
+                continue
+            with filepath.open("rb") as f:
+                header = f.read(4)
+            if header.startswith(_MAGIC_HEADERS):
+                if warn:
+                    logger.warning(
+                        "file %s starts with an executable file magic header: %r",
+                        filepath.relative_to(root_dir),
+                        header,
+                    )
+                issues.append(filepath)
+    return issues
diff --git a/tests/test_sources.py b/tests/test_sources.py
index 9e216a25..aa9bdcfe 100644
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -1,4 +1,5 @@
 import pathlib
+import sys
 import typing
 from unittest.mock import Mock, patch
 
@@ -218,3 +219,39 @@ def test_validate_sdist_file(
     else:
         with pytest.raises(ValueError):
             sources.validate_sdist_filename(req, version, sdist_file)
+
+
+# read header of Python executable
+with open(sys.executable, "rb") as _f:
+    _EXEC_HEADER = _f.read(8)
+
+
+@pytest.mark.parametrize(
+    "filename,content,hit",
+    [
+        ("test.py", b"#!/usr/bin/python", False),
+        ("test.so", b"ignore", True),
+        ("test.SO", b"ignore", True),
+        ("test", _EXEC_HEADER, True),
+        ("test", b"\xcf\xfa\xed\xfe\x07\x00\x00\x01", True),
+        # assume that packages do not disguise compiled code as .py files.
+        # A malicious actor can use more elaborate tricks to hide bad code.
+        ("test.py", _EXEC_HEADER, False),
+    ],
+)
+def test_scan_compiled_extensions(
+    filename: str, content: bytes, hit: bool, tmp_path: pathlib.Path
+) -> None:
+    filepath = tmp_path / filename
+    with filepath.open("wb") as f:
+        f.write(content)
+    matches = sources.scan_compiled_extensions(tmp_path)
+    if hit:
+        assert matches == [filepath]
+    else:
+        assert matches == []
+
+
+def test_scan_compiled_extensions_broken_symlink(tmp_path: pathlib.Path) -> None:
+    tmp_path.joinpath("dangling").symlink_to(tmp_path / "missing")
+    assert sources.scan_compiled_extensions(tmp_path) == []