diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83bbb8fe0e..4c67194f20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,9 +40,9 @@ if (MFC_ALL)
 endif()
 
 # Validate CMAKE_BUILD_TYPE to catch typos (CMake is case-sensitive).
-set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "")
+set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "Fast" "")
 if (NOT CMAKE_BUILD_TYPE IN_LIST _VALID_BUILD_TYPES)
-    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release")
+    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release, Fast")
 endif()
 
 # RelDebug: a lighter debug mode for CI. Compiler-specific blocks below add the
@@ -51,6 +51,13 @@ set(CMAKE_C_FLAGS_RELDEBUG "-g" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELDEBUG "-g" CACHE STRING "")
 set(CMAKE_Fortran_FLAGS_RELDEBUG "-g" CACHE STRING "")
 
+# Fast: fast-iteration dev builds (e.g. GPU print-debugging). Deliberately matches
+# none of the Release-only (IPO, -march=native) or Debug/RelDebug-only (MFC_DEBUG,
+# -gpu=debug) conditional blocks below, so it inherits none of them - just a light -O1.
+set(CMAKE_C_FLAGS_FAST "-O1" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_FAST "-O1" CACHE STRING "")
+set(CMAKE_Fortran_FLAGS_FAST "-O1" CACHE STRING "")
+
 if (MFC_SINGLE_PRECISION)
     add_compile_definitions(MFC_SINGLE_PRECISION)
 else()
@@ -330,6 +337,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
     add_compile_definitions(MFC_DEBUG)
 endif()
 
+# Fast: light optimization for dev iteration. Like Debug/RelDebug, the opt flag
+# is also injected here; it repeats the -O1 already in the CMAKE_*_FLAGS_FAST
+# cache vars. -O1 keeps compile time low while giving acceptable runtime; no
+# MFC_DEBUG, so device routines stay free of host-only debug aborts and compile without IPO.
+if (CMAKE_BUILD_TYPE STREQUAL "Fast")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-O1>)
+endif()
+
 
 
 # HANDLE_SOURCES: Given a target (herein ):
@@ -676,8 +691,15 @@ exit 0
             target_compile_options(${a_target} PRIVATE -fopenmp)
             target_link_options(${a_target} PRIVATE -fopenmp)
         elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-            target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
-            target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
+            if(CMAKE_BUILD_TYPE STREQUAL "Fast")
+                # Fast dev-iteration: -O1 + -fno-lto eliminates whole-program device LTO.
+                # (-fopenmp-target-jit is not yet supported by AMD flang as of 23.x.)
+                target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O1 -fno-lto -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
+                target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fno-lto)
+            else()
+                target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
+                target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
+            endif()
         endif()
     endif()
 
@@ -746,9 +768,16 @@ exit 0
                 find_library(HIP_LIB amdhip64
                     HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                 find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
-                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
-                target_include_directories(${a_target} PRIVATE
-                    "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" "$ENV{OLCF_AFAR_ROOT}/lib/llvm/lib" REQUIRED)
+                # Use the lib/llvm hipfort module directory when it exists; otherwise
+                # fall back to the top-level include directory.
+                if(EXISTS "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn")
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn")
+                else()
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                endif()
                 target_link_libraries(${a_target} PRIVATE
                     ${HIP_LIB} ${HIPFORT_AMDGCN_LIB})
 
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 01efb1a9b1..096859a193 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -27,6 +27,53 @@ _MAKE_PROGRESS_RE = re.compile(r"^\[\s*(\d+)%\]\s+(.*)$")
 
 
 
+def _cmake_build_type() -> str:
+    """Map the CLI build-mode flags to a CMAKE_BUILD_TYPE string (default: Release)."""
+    if ARG("debug"):
+        return "Debug"
+    if ARG("reldebug"):
+        return "RelDebug"
+    if ARG("fast_build", False):
+        return "Fast"
+    return "Release"
+
+
+def _apply_fast_build_gpu_arch() -> None:
+    """Under --fast-build on an NVHPC GPU build, restrict device codegen to a
+    single compute capability (the node's GPU), overriding the multi-arch
+    MFC_CUDA_CC that the module files set. CMake reads $ENV{MFC_CUDA_CC}.
+
+    Cray/AMD GPU builds don't use MFC_CUDA_CC (they are already single-arch via
+    craype-accel/--offload-arch), so this only acts when MFC_CUDA_CC is set.
+    Raises MFCException if nvidia-smi yields no numeric capability and MFC_FAST_ARCH is unset."""
+    if not ARG("fast_build", False) or ARG("gpu") == gpuConfigOptions.NONE.value:
+        return
+    if not os.environ.get("MFC_CUDA_CC"):  # not an NVHPC node; nothing to do
+        return
+
+    override = os.environ.get("MFC_FAST_ARCH")  # escape hatch for login nodes
+    if override:
+        os.environ["MFC_CUDA_CC"] = override
+        return
+
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+            check=True,  # non-zero exit raises CalledProcessError, handled below as "no GPU"
+        )
+        caps = [cap for cap in (ln.strip().replace(".", "") for ln in result.stdout.splitlines()) if cap.isdigit()]  # keep numeric entries only
+    except (OSError, subprocess.SubprocessError):
+        caps = []
+
+    if not caps:
+        raise MFCException("--fast-build: could not detect a local GPU compute capability " "(no GPU visible via nvidia-smi). Run on a GPU node, or set " "MFC_FAST_ARCH=<cc> (e.g. MFC_FAST_ARCH=90).")
+
+    os.environ["MFC_CUDA_CC"] = caps[0]  # first reported GPU decides the single target arch
+
+
 def _run_build_with_progress(command: typing.List[str], target_name: str, streaming: bool = False) -> subprocess.CompletedProcess:
     """
     Run a build command with a progress bar that parses ninja output.
@@ -367,6 +414,10 @@ def is_buildable(self) -> bool:
     def configure(self, case: Case):
         if ARG("debug") and ARG("reldebug"):
             raise MFCException("--debug and --reldebug are mutually exclusive.")
+        if ARG("fast_build", False) and (ARG("debug") or ARG("reldebug")):
+            raise MFCException("--fast-build is mutually exclusive with --debug/--reldebug.")
+
+        _apply_fast_build_gpu_arch()
 
         build_dirpath = self.get_staging_dirpath(case)
         cmake_dirpath = self.get_cmake_dirpath()
@@ -386,9 +437,9 @@ def configure(self, case: Case):
             # build the configured targets. This is mostly useful for debugging.
             # See: https://cmake.org/cmake/help/latest/variable/CMAKE_EXPORT_COMPILE_COMMANDS.html.
             "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
-            # Set build type (Debug, RelDebug, or Release).
+            # Set build type (Debug, RelDebug, Fast, or Release).
             # See: https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html
-            f"-DCMAKE_BUILD_TYPE={'Debug' if ARG('debug') else 'RelDebug' if ARG('reldebug') else 'Release'}",
+            f"-DCMAKE_BUILD_TYPE={_cmake_build_type()}",
             # Used by FIND_PACKAGE (/FindXXX) to search for packages, with the
             # second highest level of priority, still letting users manually
             # specify _ROOT, which has precedence over CMAKE_PREFIX_PATH.
@@ -468,7 +519,7 @@ def build(self, case: input.MFCInputFile):
             "--parallel",
             ARG("jobs"),
             "--config",
-            "Debug" if ARG("debug") else "RelDebug" if ARG("reldebug") else "Release",
+            _cmake_build_type(),  # same mapping as -DCMAKE_BUILD_TYPE in configure()
         ]
 
         verbosity = ARG("verbose")
diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py
index 02a8732f9b..0e33c8ccbc 100644
--- a/toolchain/mfc/lock.py
+++ b/toolchain/mfc/lock.py
@@ -5,7 +5,7 @@ from .printer import cons
 from .state import MFCConfig
 
 
-MFC_LOCK_CURRENT_VERSION: int = 8
+MFC_LOCK_CURRENT_VERSION: int = 9  # NOTE(review): presumably bumped for the new MFCConfig.fast_build field - confirm
 
 
 @dataclasses.dataclass
diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py
index 94a37be947..a14393ebde 100644
--- a/toolchain/mfc/state.py
+++ b/toolchain/mfc/state.py
@@ -16,6 +16,7 @@ class MFCConfig:
     gpu: str = gpuConfigOptions.NONE.value
     debug: bool = False
     reldebug: bool = False
+    fast_build: bool = False  # presumably set by the --fast-build CLI flag (verify); maps to CMAKE_BUILD_TYPE=Fast
     gcov: bool = False
     unified: bool = False
     single: bool = False