diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py new file mode 100644 index 0000000000..975dc7101d --- /dev/null +++ b/.github/scripts/ci_cuda_toolkit.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +import argparse +import os +import platform +import subprocess +from pathlib import Path + + +REQUIRED_HEADERS = ( + "cuda.h", + "nvrtc.h", + "cuda_fp16.h", + "vector_types.h", +) + + +def cuda_version() -> str: + version = os.environ.get("CUDA_VERSION", "").strip() + if not version: + raise SystemExit("CUDA_VERSION is not set.") + parts = version.split(".") + if len(parts) < 2 or not all(part.isdigit() for part in parts[:2]): + raise SystemExit(f"CUDA_VERSION must start with major.minor, got: {version}") + return version + + +def major_minor(version: str) -> str: + major, minor, *_ = version.split(".") + return f"{major}.{minor}" + + +def cache_base() -> Path: + base = os.environ.get("CUDA_CACHE_BASE", "").strip() + if not base: + raise SystemExit("CUDA_CACHE_BASE is not set.") + return Path(base) + + +def cache_root(version: str) -> str: + return str(cache_base() / f"v{major_minor(version)}") + + +def cache_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-v2" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1" + + +def cache_restore_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-" + + +def emit_outputs() -> None: + version = cuda_version() + lines = ( + f"cache_root={cache_root(version)}", + f"cache_key={cache_key(version)}", + f"cache_restore_key={cache_restore_key(version)}", + ) + output = os.environ.get("GITHUB_OUTPUT") + if output: + with open(output, "a", encoding="utf-8") as file: + file.write("\n".join(lines)) + file.write("\n") + else: + print("\n".join(lines)) + + +def nvcc_path(root: Path) -> Path: + 
executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" + return root / "bin" / executable + + +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + +def verify_toolkit(root: Path, version: str) -> bool: + missing = [str(nvcc_path(root))] + missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) + missing = [path for path in missing if not Path(path).exists()] + if missing: + print(f"CUDA Toolkit cache is incomplete at {root}.") + for path in missing: + print(f"missing: {path}") + return False + + result = run([str(nvcc_path(root)), "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + print(result.stdout) + expected = f"release {major_minor(version)}" + if result.returncode != 0 or expected not in result.stdout: + print(f"Expected CUDA Toolkit {major_minor(version)} at {root}.") + return False + return True + + +def verify() -> None: + version = cuda_version() + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", cache_root(version))) + if not verify_toolkit(root, version): + raise SystemExit(1) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("command", choices=("outputs", "verify")) + args = parser.parse_args() + + if args.command == "outputs": + emit_outputs() + elif args.command == "verify": + verify() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8a62da4fc7..d6d593ebc5 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -9,6 +9,11 @@ permissions: contents: read actions: read +env: + CUDA_VERSION: '13.2.1' + CUDA_CACHE_BASE: 'C:\nabla-ci\cuda' + CUDA_CONTAINER_ROOT: 'C:\cuda' + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -47,9 +52,36 @@ jobs: } & $rgExe --version + prepare-host-cuda: + name: Prepare host CUDA + 
runs-on: windows-2025 + + steps: + - name: Checkout CUDA CI helper + uses: actions/checkout@v6 + with: + fetch-depth: 1 + sparse-checkout: | + .github/scripts + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) - needs: prepare-host-rg + needs: [prepare-host-rg, prepare-host-cuda] runs-on: ${{ matrix.os }} env: @@ -165,6 +197,10 @@ jobs: with: submodules: 'recursive' + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore ripgrep host tool id: cache-rg uses: actions/cache@v5 @@ -183,9 +219,29 @@ jobs: $rgDir | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append & $rgExe --version + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + - name: Pull Image run: | - docker pull "${{ env.image }}:${{ matrix.tag }}" + $image = "${{ env.image }}:${{ matrix.tag }}" + for ($attempt = 1; $attempt -le 3; $attempt++) { + docker pull $image + if ($LASTEXITCODE -eq 0) { + exit 0 + } + Write-Warning "docker pull failed for $image on attempt $attempt." 
+ Start-Sleep -Seconds (15 * $attempt) + } + exit $LASTEXITCODE - name: Run Container run: | @@ -199,8 +255,10 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --env "CUDA_PATH=${{ env.CUDA_CONTAINER_ROOT }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ env.CUDA_CONTAINER_ROOT }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -222,6 +280,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.CUDA_CONTAINER_ROOT }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -635,7 +694,13 @@ jobs: with: fetch-depth: 1 sparse-checkout: | + .github/scripts smoke + src/nbl/ext/CUDAInterop/smoke + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 @@ -646,6 +711,17 @@ jobs: install_lavapipe: true github_token: ${{ github.token }} + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + - name: Download Nabla install artifact uses: actions/download-artifact@v8 with: @@ -668,3 +744,15 @@ jobs: - name: Smoke Flow BUILD_ONLY run: cmake -D FLOW=BUILD_ONLY -D CONFIG=${{ matrix.config }} -P smoke/RunSmokeFlow.cmake + + - name: Build CUDA interop package smoke + shell: pwsh + run: | + cmake ` + 
-S src/nbl/ext/CUDAInterop/smoke ` + -B smoke/cuda-interop-smoke ` + -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` + -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ steps.cuda.outputs.cache_root }}" + + cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} diff --git a/CMakeLists.txt b/CMakeLists.txt index fa74e167f0..97ece5d9f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,15 +70,15 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build CUDA interop support?" OFF) +set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) - find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") - message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") - else() - message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") + if(NBL_CUDA_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() + find_package(CUDAToolkit 13.0 REQUIRED) + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") endif() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) @@ -183,13 +183,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF) -if(NBL_COMPILE_WITH_CUDA) - find_package(OPTIX REQUIRED) - message(STATUS "CUDA enabled and OptiX found!") -else() - if(NBL_BUILD_OPTIX) +if(NBL_BUILD_OPTIX) + if(NOT NBL_COMPILE_WITH_CUDA) message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!") endif() + find_package(OPTIX REQUIRED) + message(STATUS "CUDA enabled and OptiX found!") endif() option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" 
OFF) @@ -313,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL) set(_NBL_NABLA_CONFIG_FILES "${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake" + "${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake" ) install(EXPORT NablaExportTargets diff --git a/CMakePresets.json b/CMakePresets.json index 3c11567f46..2c25d06953 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,7 +14,7 @@ "NBL_EMBED_BUILTIN_RESOURCES": "ON", "NBL_NSC_MODE": "SOURCE", "NBL_UPDATE_GIT_SUBMODULE": "OFF", - "NBL_COMPILE_WITH_CUDA": "OFF", + "NBL_COMPILE_WITH_CUDA": "ON", "NBL_BUILD_OPTIX": "OFF", "NBL_BUILD_MITSUBA_LOADER": "ON", "NBL_BUILD_RADEON_RAYS": "OFF", diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake index f855c396b9..42aa789bee 100644 --- a/cmake/FindZLIB.cmake +++ b/cmake/FindZLIB.cmake @@ -4,4 +4,6 @@ endif() set(ZLIB_FOUND TRUE) set(ZLIB_LIBRARY ZLIB::ZLIB) -set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") \ No newline at end of file +set(ZLIB_LIBRARIES ZLIB::ZLIB) +set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") +set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}") diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake new file mode 100644 index 0000000000..e84b2d1a8e --- /dev/null +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -0,0 +1,28 @@ +function(nbl_target_link_cuda_interop TARGET_NAME SCOPE) + if(NOT SCOPE MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(SCOPE PRIVATE) + endif() + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop) + set(_include_dir_entries "") + foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) + if(_include_dir) + file(TO_CMAKE_PATH "${_include_dir}" _include_dir) + list(APPEND _include_dir_entries " \"${_include_dir}\"") + endif() + endforeach() + list(JOIN 
_include_dir_entries "," _include_dirs_json) + set(_runtime_json [=[ +{ + "cudaRuntimeIncludeDirs": [ +@_include_dirs_json@ + ] +} +]=]) + string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY) + set(_runtime_json_path "$/nbl_cuda_interop_runtime.json") + if(_NBL_CUDA_INTEROP_RUNTIME_JSON) + set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") + endif() + file(GENERATE OUTPUT "${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}") +endfunction() diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index b22b3ad0d7..0464340ce3 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS) elseif(_NBL_NABLA_COMPONENT STREQUAL "Core") set(_NBL_NABLA_LOAD_CORE ON) set(Nabla_Core_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop") + set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(Nabla_CUDAInterop_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -80,6 +85,23 @@ if(_NBL_NABLA_LOAD_NSC) endif() endif() +if(_NBL_NABLA_LOAD_CUDA_INTEROP) + include(CMakeFindDependencyMacro) + + if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") + set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") + endif() + + find_dependency(CUDAToolkit 13.0 REQUIRED) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + 
include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + endif() + endif() +endif() + check_required_components(Nabla) # diff --git a/examples_tests b/examples_tests index 93ca5efe58..39d02e2602 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580 +Subproject commit 39d02e26023c72a7d3241e5df85e9b7c4afacb84 diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h new file mode 100644 index 0000000000..ea360d785a --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +/* + CUDA SDK opt-in boundary for Nabla CUDA interop. + + Public nbl/video CUDA interop headers expose SDK-free cuda_interop::SCU* opaque handles. This header is the + explicit boundary where a consumer accepts CUDA/NVRTC SDK headers, raw CU* types, and Nabla helper APIs whose + signatures use CUDA SDK types. This happens by linking Nabla::ext::CUDAInterop and including this file, which + includes cuda.h and nvrtc.h. The CUDA SDK becomes a compile-time requirement only for that SDK opt-in + consumer. + + The exported definitions stay in Nabla because they are glue between the Nabla world and the CUDA world: + dynamic Driver API/NVRTC loader access, NVRTC program helpers, error handling, runtime header discovery, and + CUDA/Vulkan resource interop lifetime. This header only exposes the CUDA-typed signatures for that glue after + the consumer explicitly opts in. Nabla::ext::CUDAInterop is the build-system edge for this SDK-typed surface. + It is not a separate owner of these definitions. Code that only consumes Nabla::Nabla does not need CUDA SDK + headers and does not parse CUDA/NVRTC declarations. 
+ + Keeping SDK-defined types out of Nabla's public ABI is intentional. CUDA headers have changed observable + compile-time constants across SDK versions: + - CUDA Toolkit 9.0 documented CU_CTX_FLAGS_MASK as 0x1f. CUDA 12.1, 12.5, and 13.2 define it as 0xff. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS as 93. CUDA 12.1, 12.5, + and 13.2 keep 93 as CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 and define the unsuffixed name + as 122. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR as 94. CUDA 12.1, 12.5, + and 13.2 keep 94 as CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 and define the unsuffixed name + as 123. + + If these SDK declarations leak through public Nabla headers, consumers can silently compile against a + different CUDA interpretation than the one used to build the interop implementation. That is especially + problematic for installed packages, plugins, and separately built downstream projects. The opaque handles + keep Nabla's public ABI independent from CUDA SDK headers. This opt-in header then validates handle + size/alignment against the SDK selected by the SDK opt-in consumer. +*/ +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#include "nbl/video/CUDAInteropNativeAPI.h" +namespace nbl::video::cuda_native +{ + +/* + This header specializes the SDK-free opaque handles from nbl/video/CUDAInteropHandles.h for the CUDA SDK + visible to this translation unit. After that opt-in, Nabla interop methods can be called with native CUDA/NVRTC + types such as CUdeviceptr, CUexternalSemaphore, nvrtcProgram, CUresult, and nvrtcResult. + + The size/alignment checks live in nbl/video/CUDAInteropNativeAPI.h. This exact version check is a policy helper + for SDK-typed code that wants to warn about or reject compatible-but-different SDK headers. 
+*/ +inline bool isBuildCUDASDKVersionExactMatch() +{ + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); + return buildVersion==CUDA_VERSION; +} + +} + +#endif diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 7820aa1222..bb0677657d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "../../../../src/nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAHandler.h" #include #include @@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted } } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index 56142448c8..10fab3a454 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -11,18 +11,18 @@ namespace nbl::system { -class DefaultFuncPtrLoader final : FuncPtrLoader +class NBL_API2 DefaultFuncPtrLoader final : FuncPtrLoader { void* lib; public: inline DefaultFuncPtrLoader() : lib(nullptr) {} - NBL_API2 DefaultFuncPtrLoader(const char* name); + DefaultFuncPtrLoader(const char* name); inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader() { operator=(std::move(other)); } - NBL_API2 ~DefaultFuncPtrLoader(); + ~DefaultFuncPtrLoader(); inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other) { @@ -40,4 +40,4 @@ class DefaultFuncPtrLoader final : FuncPtrLoader } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DynamicFunctionCaller.h b/include/nbl/system/DynamicFunctionCaller.h index cf99be32f0..d5642d3ea9 100644 --- a/include/nbl/system/DynamicFunctionCaller.h +++ b/include/nbl/system/DynamicFunctionCaller.h @@ -16,7 +16,7 @@ class DynamicFunctionCallerBase : public core::Unmovable { protected: static_assert(std::is_base_of::value, "Need a 
function pointer loader derived from `FuncPtrLoader`"); - FuncPtrLoaderT loader; + mutable FuncPtrLoaderT loader; public: //DynamicFunctionCallerBase() : loader() {} DynamicFunctionCallerBase(DynamicFunctionCallerBase&& other) : DynamicFunctionCallerBase() @@ -29,6 +29,16 @@ class DynamicFunctionCallerBase : public core::Unmovable } virtual ~DynamicFunctionCallerBase() = default; + inline bool isLibraryLoaded() const + { + return loader.isLibraryLoaded(); + } + + inline void* loadFuncPtr(const char* funcname) const + { + return loader.loadFuncPtr(funcname); + } + DynamicFunctionCallerBase& operator=(DynamicFunctionCallerBase&& other) { std::swap(loader, other.loader); @@ -41,6 +51,8 @@ class DynamicFunctionCallerBase : public core::Unmovable #define NBL_SYSTEM_IMPL_INIT_DYNLIB_FUNCPTR(FUNC_NAME) ,NBL_CONCATENATE(p , FUNC_NAME)(Base::loader.loadFuncPtr( #FUNC_NAME )) #define NBL_SYSTEM_IMPL_SWAP_DYNLIB_FUNCPTR(FUNC_NAME) std::swap(NBL_CONCATENATE(p, FUNC_NAME),other.NBL_CONCATENATE(p, FUNC_NAME)); +// Load an extra function from an already loaded dynamic library without adding it to the generated caller class. +#define NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(CALLER, FUNC_NAME) nbl::system::DynamicLibraryFunctionPointer((CALLER).loadFuncPtr(#FUNC_NAME)) #define NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS( CLASS_NAME, FUNC_PTR_LOADER_TYPE, ... 
) \ class CLASS_NAME : public nbl::system::DynamicFunctionCallerBase\ diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 02f85fdac8..d6a1378dcb 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -4,24 +4,15 @@ #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ - -#include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ - -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include +#include namespace nbl::video { @@ -29,13 +20,11 @@ class CCUDAHandler; class NBL_API2 CCUDADevice : public core::IReferenceCounted { - public: + public: #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif enum E_VIRTUAL_ARCHITECTURE @@ -56,66 +45,57 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted EVA_80, EVA_COUNT }; - static inline constexpr const char* virtualArchCompileOption[] = { - "-arch=compute_30", - "-arch=compute_32", - "-arch=compute_35", - "-arch=compute_37", - "-arch=compute_50", - "-arch=compute_52", - "-arch=compute_53", - "-arch=compute_60", - "-arch=compute_61", - "-arch=compute_62", - "-arch=compute_70", - "-arch=compute_72", - 
"-arch=compute_75", - "-arch=compute_80" - }; - inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); - - ~CCUDADevice(); + E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() const; - inline core::SRange geDefaultCompileOptions() const - { - return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; - } + ~CCUDADevice() override; - CUdevice getInternalObject() const { return m_handle; } + core::SRange geDefaultCompileOptions() const; - const CCUDAHandler* getHandler() const { return m_handler.get(); } + const CCUDAHandler* getHandler() const; + cuda_interop::SCUdevice getInternalObject() const; + cuda_interop::SCUcontext getContext() const; - bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - - size_t roundToGranularity(CUmemLocationType location, size_t size) const; + struct SExportableMemoryCreationParams + { + size_t size; + uint32_t alignment; + uint32_t locationType; + }; - core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); + inline size_t roundToGranularity(uint32_t locationType, size_t size) const + { + if (locationType>=m_allocationGranularity.size()) + return 0u; + const auto granularity = m_allocationGranularity[locationType]; + if (size==0u || granularity==0u) + return 0u; + return ((size - 1) / granularity + 1) * granularity; + } + core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: - CUresult 
reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; + friend class CCUDAHandler; - static constexpr auto CudaMemoryLocationCount = 5; + static constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; + struct SNativeState; + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + bool isValid() const; - const system::logger_opt_ptr m_logger; + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_physicalDevice; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + bool m_valid = false; core::smart_refctd_ptr m_handler; - CUdevice m_handle; - CUcontext m_context; - std::array m_allocationGranularity; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 1c3d206906..f1ae7f6031 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -4,62 +4,43 @@ #ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" -#ifdef _NBL_COMPILE_WITH_CUDA_ - -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { - class CCUDADevice; class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { - public: - - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; + public: + struct SCachedCreationParams + { + size_t granularSize; + external_handle_t externalHandle; + bool deviceLocal; + }; - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; + ~CCUDAExportableMemory() override; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; + cuda_interop::SCUdeviceptr getDeviceptr() const; + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - CUdeviceptr getDeviceptr() const { return m_params.ptr; } + private: + friend class CCUDADevice; - const SCreationParams& getCreationParams() const { return m_params; } + struct SNativeState; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + static core::smart_refctd_ptr create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - - private: - - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 61e9522a66..4d2324cfa6 100644 --- a/include/nbl/video/CCUDAHandler.h +++ 
b/include/nbl/video/CCUDAHandler.h @@ -7,160 +7,92 @@ #include "nbl/core/declarations.h" #include "nbl/core/definitions.h" +#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/declarations.h" +#include "nbl/system/path.h" +#include "nbl/video/CUDAInteropHandles.h" -#include "nbl/video/CCUDADevice.h" +#include +#include +#include +#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +class CCUDADevice; +class CVulkanConnection; +class IPhysicalDevice; +namespace cuda_native +{ +// SDK-free forward declarations for the dynamic CUDA/NVRTC tables exposed by the opt-in native header. +class CUDA; +class NVRTC; +} -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +namespace cuda_interop { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +inline constexpr uint32_t RuntimeVersionComponentCount = 2u; +using SRuntimeVersion = std::array; - inline bool defaultHandleResult(CUresult result) const - { - core::smart_refctd_ptr logger = m_logger.get(); - return defaultHandleResult(result,logger.get()); - } +struct SRuntimeIncludeDir +{ + system::path path; + std::string source; + uint32_t cudaVersion = 0u; + bool completeRuntimeHeaderSet = false; +}; - // - bool defaultHandleResult(nvrtcResult result); +struct SRuntimeCompileEnvironment +{ + core::vector includeDirs; + core::vector includeDirInfos; +}; - // - template - static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles); +inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + 
core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} +} - // +class NBL_API2 CCUDAHandler : public core::IReferenceCounted +{ + public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static uint32_t getBuildCUDASDKVersion(); + uint32_t getLoadedCUDADriverVersion() const; + cuda_interop::SRuntimeVersion getLoadedNVRTCVersion() const; + const cuda_native::CUDA& getCUDAFunctionTable() const; + const cuda_native::NVRTC& getNVRTCFunctionTable() const; + core::SRange getDefaultRuntimeIncludeOptions() const; + inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } + + struct SPTXResult + { + core::smart_refctd_ptr ptx; + cuda_interop::SNVRTCResult result; + }; - // - using LibLoader = system::DefaultFuncPtrLoader; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - ,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - 
,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback - ); - const CUDA& getCUDAFunctionTable() const {return m_cuda;} + static bool defaultHandleResult(cuda_interop::SCUresult result, const system::logger_opt_ptr& logger); + bool defaultHandleResult(cuda_interop::SCUresult result) const; + bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize + cuda_interop::SNVRTCResult createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int 
headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; + cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; + SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; + SPTXResult compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string* log=nullptr, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); - const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - - // inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); @@ -169,29 +101,9 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted inline const auto& getSTDHeaderContents() { return m_headerContents; } inline const auto& getSTDHeaderNames() { return m_headerNames; } - // - nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames); - } - inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - 
file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); - } - struct SCUDADeviceInfo { - CUdevice handle = {}; - CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + std::array uuid = {}; }; inline core::vector const& getAvailableDevices() const @@ -199,112 +111,24 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return m_availableDevices; } - // - inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) - { - return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); - } - - // - nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log); - - // - struct ptx_and_nvrtcResult_t - { - core::smart_refctd_ptr ptx; - nvrtcResult result; - }; - ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: + ~CCUDAHandler() override; - ~CCUDAHandler() = default; - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getPTX(program); - } + private: + struct SNativeState; + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); - // function tables - CUDA m_cuda; - NVRTC m_nvrtc; - - // + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; - int m_version; }; -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!((handler)->defaultHandleResult(cudaResult))) { \ - assert(false); \ - } \ - } while(0) - } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 4e3bfcd085..0266706480 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -1,42 +1,34 @@ -#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H -#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" -#include "cuda.h" -#include "nvrtc.h" -#if 
CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -#endif // _NBL_COMPILE_WITH_CUDA +#include namespace nbl::video { +class CCUDADevice; + class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { - public: - - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} + public: + ~CCUDAImportedMemory() override; + cuda_interop::SCUexternalMemory getInternalObject() const; + bool getMappedBuffer(cuda_interop::SOutput mappedBuffer) const; - ~CCUDAImportedMemory() override; + private: + friend class CCUDADevice; - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); - - private: - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 2e5010fa2d..7f2b266383 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -4,43 +4,34 @@ #ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { +class CCUDADevice; + class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { - public: - - CUexternalSemaphore getInternalObject() const { return m_handle; } - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - private: - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + ~CCUDAImportedSemaphore() override; + cuda_interop::SCUexternalSemaphore getInternalObject() const; + + private: + friend class CCUDADevice; + + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h new file mode 100644 index 0000000000..efea886b96 --- /dev/null +++ b/include/nbl/video/CUDAInterop.h @@ -0,0 +1,14 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/video/CUDAInteropHandles.h" +#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" + +#endif diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h new file mode 100644 index 0000000000..a7664310aa --- /dev/null +++ b/include/nbl/video/CUDAInteropHandles.h @@ -0,0 +1,146 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ + +/* + SDK-free CUDA interop boundary. + + Public nbl/video/CCUDA*.h headers cannot include cuda.h or nvrtc.h, but they still need to carry CUDA interop + state and write CUDA/NVRTC handles for opt-in users. The split below keeps those two roles explicit: + - SOpaqueCUDAHandle owns handle bits and is used in Nabla object layout, parameters, and return values. + - SOutput is a non-owning output adapter. C++ does not apply user-defined conversions through T* or mutable T&, + so output parameters need a small bridge to write directly into either SCU* storage or native SDK storage. + + CUDAInteropNative.h is the only header that maps these opaque types back to CUDA/NVRTC SDK types. These helpers + are class templates with in-class member definitions, so they are inline by the language rules and add no exported + symbols. 
+*/ +template +struct SOpaqueCUDANativeType; + +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +template +concept cuda_native_handle_for = + requires { typename SOpaqueCUDANativeType::type; } && + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +/* + Non-owning output bridge for SDK-free APIs. It keeps one Nabla signature while opt-in callers can pass raw + CUDA/NVRTC output variables directly, e.g. `CUdeviceptr ptr; memory->getMappedBuffer(ptr);`. +*/ +template +struct SOutput +{ + SOutput(std::nullptr_t) : ptr(nullptr) {} + SOutput(Opaque& opaque) : ptr(&opaque) {} + SOutput(Opaque* opaque) : ptr(opaque) {} + + template + requires cuda_native_handle_for + SOutput(Native& native) : ptr(reinterpret_cast(&native)) {} + + template + requires cuda_native_handle_for + SOutput(Native* native) : ptr(reinterpret_cast(native)) {} + + Opaque& operator*() const { return *ptr; } + operator Opaque*() const { return ptr; } + explicit operator bool() const { return ptr!=nullptr; } + + private: + Opaque* ptr; +}; + +/* + Owned opaque value used in public Nabla ABI. Native reference conversions become available only after the opt-in + header specializes SOpaqueCUDANativeType for the selected CUDA SDK. 
+*/ +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle +{ + uint8_t value[sizeof(Storage)] = {}; + + SOpaqueCUDAHandle() = default; + + template + requires cuda_native_handle_for + SOpaqueCUDAHandle(const Native& native) + { + operator=(native); + } + + template + requires cuda_native_handle_for + operator Native&() + { + return *reinterpret_cast(value); + } + + template + requires cuda_native_handle_for + operator const Native&() const + { + return *reinterpret_cast(value); + } + + template + requires cuda_native_handle_for + Derived& operator=(const Native& native) + { + static_cast(*this) = native; + return static_cast(*this); + } + + template + requires cuda_native_handle_for + friend bool operator==(const Derived& lhs, const Native& rhs) + { + return static_cast(lhs)==rhs; + } + + template + requires cuda_native_handle_for + friend bool operator==(const Native& lhs, const Derived& rhs) + { + return lhs==static_cast(rhs); + } +}; + +#define NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(NAME, STORAGE) \ + struct NAME : SOpaqueCUDAHandle \ + { \ + using SOpaqueCUDAHandle::SOpaqueCUDAHandle; \ + using SOpaqueCUDAHandle::operator=; \ + } + +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdevice, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUcontext, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdeviceptr, uintptr_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalMemory, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalSemaphore, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUresult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCResult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCProgram, void*); + +#undef NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE + +} + +#endif diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h new file mode 100644 index 0000000000..6084d4a00c --- /dev/null +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -0,0 +1,191 @@ +// Copyright (C) 2018-2020 - DevSH 
Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ + +#include +#include + +#include "nbl/video/CUDAInterop.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" + +namespace nbl::video::cuda_interop +{ + +#define NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(OPAQUE, NATIVE) \ + template<> struct SOpaqueCUDANativeType { using type = NATIVE; }; \ + static_assert(cuda_opaque_handle) + +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdevice, CUdevice); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUcontext, CUcontext); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdeviceptr, CUdeviceptr); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalMemory, CUexternalMemory); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalSemaphore, CUexternalSemaphore); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUresult, CUresult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCResult, nvrtcResult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCProgram, nvrtcProgram); + +#undef NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE + +} + +namespace nbl::video::cuda_native +{ + +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. + + This file lives under include/ because it is shared with nbl/ext/CUDAInterop/CUDAInteropNative.h, the public + opt-in header for consumers that explicitly accept CUDA SDK types. 
Its physical location does not make it part + of the default Nabla public interface: nbl/video/CCUDA*.h headers, Nabla::Nabla public requirements, and PCH + do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. + + The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between + raw CUDA code and Nabla's exported CUDA interop objects. Consumers enter this surface only by linking + Nabla::ext::CUDAInterop and including nbl/ext/CUDAInterop/CUDAInteropNative.h. +*/ +using LibLoader = system::DefaultFuncPtrLoader; + +/* + The CUDA/NVRTC table classes contain the calls used and tested by Nabla's interop implementation. SDK opt-in + consumers can load additional Driver API or NVRTC symbols from the same table without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + + The requested symbol must be declared by the CUDA SDK visible to that translation unit because the helper uses + decltype(cuNewCall) to preserve the native function signature. 
+*/ +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload + ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + 
,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!(handler)->defaultHandleResult(nblCudaInteropResult)) \ + assert(false); \ + } while (false) + +} + +#endif diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 7f99d40309..89be885b0f 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -1,8 +1,15 @@ #ifndef __NBL_E_API_TYPE_H_INCLUDED__ #define __NBL_E_API_TYPE_H_INCLUDED__ -#include "nbl/core/declarations.h" #include +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include +#endif namespace nbl::video { @@ -31,24 +38,24 @@ constexpr external_handle_t ExternalHandleNull = -1; inline bool CloseExternalHandle(external_handle_t handle) { #ifdef _WIN32 - return CloseHandle(handle); + return CloseHandle(handle); #else - return (close(handle) == 0); + return close(handle)==0; #endif } inline external_handle_t DuplicateExternalHandle(external_handle_t handle) { #ifdef _WIN32 - HANDLE re = ExternalHandleNull; + HANDLE duplicated = ExternalHandleNull; - const HANDLE cur = GetCurrentProcess(); - if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; + const HANDLE 
process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; - return re; + return duplicated; #else - return dup(handle); + return dup(handle); #endif } diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 37f2f864bf..1a74514714 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -24,9 +24,6 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/surface/CSurfaceVulkan.h" -// CUDA -#include "nbl/video/CCUDAHandler.h" - // utilities #include "nbl/video/utilities/CDumbPresentationOracle.h" #include "nbl/video/utilities/ICommandPoolCache.h" @@ -39,9 +36,10 @@ #include "nbl/video/utilities/CSmoothResizeSurface.h" #include "nbl/video/utilities/CDefaultSwapchainFramebuffers.h" #include "nbl/video/utilities/CAssetConverter.h" +#include "nbl/video/CUDAInterop.h" //VT //#include "nbl/video/IGPUVirtualTexture.h" -#endif \ No newline at end of file +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 692efec8bd..317cf3d2a1 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) - if (NBL_BUILD_OPTIX) - set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) - endif() +if (NBL_BUILD_OPTIX) + set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() # => TODO: clean! 
@@ -128,6 +124,15 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + +set(NBL_CUDA_INTEROP_SOURCES + video/CCUDADevice.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAHandler.cpp + video/CCUDAImportedMemory.cpp + video/CCUDAImportedSemaphore.cpp +) + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -291,12 +296,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp -# CUDA - video/CCUDAHandler.cpp - video/CCUDADevice.cpp - video/CCUDAImportedSemaphore.cpp - video/CCUDAExportableMemory.cpp - video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES @@ -315,6 +314,7 @@ set(NABLA_SRCS_COMMON ${NBL_VIDEO_SOURCES} ${NBL_SCENE_SOURCES} ${NBL_META_SOURCES} + ${NBL_CUDA_INTEROP_SOURCES} ) if(MSVC) @@ -426,7 +426,8 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() if(NBL_COMPILE_WITH_CUDA) - target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) + target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_) + target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) endif() set(INTERFACE_BUILD_DEFINITIONS @@ -664,11 +665,6 @@ target_link_libraries(Nabla PRIVATE volk) # volk is part of public interface headers in Nabla target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) -# CUDA -if (NBL_COMPILE_WITH_CUDA) - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") -endif() - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS # this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path # -> TODO @@ -781,8 +777,11 @@ if(TARGET ngfx) ) endif() -# on MSVC it won't compile without this option! -target_compile_options(Nabla PUBLIC $<$:/bigobj>) +# on MSVC it won't compile without these options! 
+target_compile_options(Nabla PUBLIC + $<$:/bigobj> + $<$:/Zc:preprocessor> +) if(NBL_PCH) target_precompile_headers(Nabla @@ -793,11 +792,24 @@ if(NBL_PCH) ) endif() -# extensions start_tracking_variables_for_propagation_to_parent() add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() +if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB} + EXPORT NablaCUDAInteropExportTargets + COMPONENT Libraries + ) + install(EXPORT NablaCUDAInteropExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() +endif() + if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE") diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index f3b55531c2..264cfc7c2d 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -38,6 +38,14 @@ if (NBL_BUILD_OPTIX) ) endif() +add_subdirectory(CUDAInterop) +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB + ${NBL_EXT_CUDA_INTEROP_LIB} + PARENT_SCOPE + ) +endif() + if (NBL_BUILD_IMGUI) add_subdirectory(ImGui) set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt new file mode 100644 index 0000000000..a9e1663fa9 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -0,0 +1,26 @@ +include(common) +include(NablaCUDAInteropHelpers) + +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") + + file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h") + set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES + ${NBL_EXT_CUDA_INTEROP_IDE_HEADERS} + CMakeLists.txt + README.md + ) + 
set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE) + + # Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers. + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES}) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE + Nabla + CUDA::toolkit + ) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") + add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) + set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE) +endif() + +add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md new file mode 100644 index 0000000000..0d8ebe2f08 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/README.md @@ -0,0 +1,182 @@ +# CUDA Interop + +## Layout + +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. +- The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. +- CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and binds SDK-free opaque handles to CUDA/NVRTC SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. 
It includes the native API header so SDK-typed code can use CUDA/NVRTC handles directly with Nabla interop methods. + +## CMake Usage + +`Nabla::Nabla`-only usage stays SDK-free: + +```cmake +find_package(Nabla CONFIG REQUIRED) +target_link_libraries(app PRIVATE Nabla::Nabla) +``` + +SDK-typed CUDA interop is explicit: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE) +``` + +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation. + +Optional overrides: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE + INCLUDE_DIRS "${cuda_runtime_headers}" +) + +nbl_target_link_cuda_interop(native_app PRIVATE + RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$/my_cuda_runtime.json" +) +``` + +Consumers can also choose the SDK used for SDK-typed compilation with: + +```cmake +cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= +``` + +This affects SDK opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. + +## SDK Opt-In Usage + +```cpp +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); +auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); + +const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); +if (!exactBuildSDK) +{ + // Warn here, or return false if this application requires exact same-SDK policy. 
+} + +auto memory = cudaDevice->createExportableMemory({ + .size = size, + .alignment = alignment, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, +}); + +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); +int driverVersion = 0; +NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), handler); + +CUdeviceptr mapped = 0; +if (importedMemory) + importedMemory->getMappedBuffer(mapped); + +CUdeviceptr exported = memory->getDeviceptr(); + +nvrtcProgram program = nullptr; +auto createResult = handler->createProgram(program, std::string(cudaSource), "kernel.cu"); + +std::string log; +auto compile = handler->compileDirectlyToPTX( + std::move(cudaSource), + "kernel.cu", + cudaDevice->geDefaultCompileOptions(), + &log +); +``` + +SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: + +- `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. +- The shipped tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: + +```cpp +auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); +if (pcuNewCall) + pcuNewCall(...); +``` + +- `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. +- CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. 
+- SDK-free output parameters use `cuda_interop::SOutput<...>`. SDK-free code can pass opaque `SCU*` values or pointers. SDK opt-in code can pass matching native CUDA/NVRTC output variables directly, for example `CUdeviceptr mapped; importedMemory->getMappedBuffer(mapped);` or `nvrtcProgram program; handler->createProgram(program, ...)`. +- `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. +- `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. + +Smoke examples: + +- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. + +## ABI + +- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. +- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
+- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation state stay private to Nabla. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. +- Runtime include-option construction is header-only and is not part of the exported ABI. +- The loaded CUDA driver and NVRTC runtime are validated at runtime. + +## Runtime Header Discovery + +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a `Nabla::Nabla` package requirement. + +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into SDK-typed CUDA interop. +- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. +- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- Runtime lookup records the source of every accepted include root and parses `CUDA_VERSION` from `cuda.h` when available. The startup report prints the primary include root, its source, its parsed CUDA version, and the full search order. +- The first include root is not required to match the SDK used to build Nabla. 
It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. +- If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. +- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. +- `CCUDAHandler` captures discovered include directories when it is created. `CCUDAHandler::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. + +Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. + +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or SDK-typed CUDA code. + +NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." 
See: + +- https://docs.nvidia.com/cuda/eula/#distribution +- https://docs.nvidia.com/cuda/eula/#attachment-a + +This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package. + +Attachment A lists header groups relevant to NVRTC runtime compilation: + +- NVIDIA Runtime Compilation Library and Header: `nvrtc.h` +- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` +- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h` + +CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: + +- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 +- https://github.com/cupy/cupy/issues/8466 + +## CUDA ON/OFF Builds + +- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. +- Nabla implementation `.cpp` files include CUDA SDK headers only behind `_NBL_COMPILE_WITH_CUDA_`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. 
Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. + +## Related Designs + +The split follows the same boundary pattern used by mature GPU projects: public/common headers avoid vendor SDK requirements, vendor SDK access is explicit, and implementation details stay outside the public API. + +- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 +- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 +- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52 +- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build 
and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58 +- ONNX Runtime keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150 diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt new file mode 100644 index 0000000000..e16d3feac0 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.30) +project(NblExtCUDAInteropSmoke CXX) + +option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." 
OFF) +set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.") + +if(NOT TARGET Nabla::Nabla) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) + if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop) + endif() + find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) +endif() + +enable_testing() + +function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) + add_executable(${TARGET_NAME} ${SOURCE_FILE}) + target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) + + add_test(NAME ${TARGET_NAME} COMMAND $<TARGET_FILE:${TARGET_NAME}>) +endfunction() + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) +target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) + +if(TARGET Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) + set(_nbl_cuda_interop_smoke_args PRIVATE) + if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON) + list(APPEND _nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + endif() + nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args}) +endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp new file mode 100644 index 0000000000..d1c15822cd --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -0,0 +1,192 @@ +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +#include "nbl/system/IApplicationFramework.h" + +#include <algorithm> +#include <array> +#include <cstdint> +#include <filesystem> +#include <memory> +#include <string> +#include <utility> + +#ifndef CUDA_VERSION +#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers."
+#endif + +namespace +{ +using namespace nbl; +using namespace nbl::video; + +[[maybe_unused]] bool compileVulkanCudaInteropRecipe( + CCUDADevice& cudaDevice, + ILogicalDevice* vulkanDevice, + core::smart_refctd_ptr vulkanMemory, + core::smart_refctd_ptr vulkanSemaphore) +{ + auto cudaMemory = cudaDevice.createExportableMemory({ + .size = 4096, + .alignment = 4096, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, + }); + if (!cudaMemory) + return false; + + auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); + auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); + auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); + + CUdeviceptr mappedVulkanMemory = 0; + if (importedFromVulkan) + importedFromVulkan->getMappedBuffer(mappedVulkanMemory); + + const CUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + CUexternalSemaphore cudaSemaphore = nullptr; + if (importedSemaphore) + cudaSemaphore = importedSemaphore->getInternalObject(); + return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; +} + +bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) +{ + auto& cuda = handler.getCUDAFunctionTable(); + + CUcontext context = nullptr; + if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) + return false; + + CUcontext poppedContext = nullptr; + bool contextPushed = false; + auto releaseContext = [&]() + { + if (context) + { + if (contextPushed) + cuda.pcuCtxPopCurrent_v2(&poppedContext); + cuda.pcuDevicePrimaryCtxRelease_v2(device); + } + }; + + if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS) + { + releaseContext(); + return false; + } + contextPushed = true; + + constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; + std::array output = {}; + + CUdeviceptr deviceMemory = 0; + bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = 
cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS; + if (deviceMemory) + ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; + + releaseContext(); + return ok && std::ranges::equal(input, output); +} + +bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + #include + extern "C" __global__ void fp16_probe(unsigned short* out) + { + out[0] = sizeof(__half); + } + )cuda"; + + std::string log; + auto compile = handler.compileDirectlyToPTX( + std::string(Source), + "cuda_fp16_discovery_probe.cu", + {nullptr,nullptr}, + &log, + 0, + nullptr, + nullptr + ); + return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; +} + +bool nativeNVRTCOutputProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + extern "C" __global__ void native_output_probe() {} + )cuda"; + + nvrtcProgram program = nullptr; + const auto result = handler.createProgram(program,std::string(Source),"native_output_probe.cu"); + if (program) + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + return result==NVRTC_SUCCESS; +} +} + +class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + if (!isAPILoaded()) + return false; + + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); + + #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON + const nbl::core::vector explicitIncludeDirs; + const nbl::core::vector runtimePathFiles = {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}; + const auto runtimeEnvironment = 
nbl::video::cuda_interop::findRuntimeCompileEnvironment(explicitIncludeDirs, runtimePathFiles); + if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) + return false; + #else + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); + #endif + const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) { + return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h"); + })!=runtimeEnvironment.includeDirs.end(); + if (includeOptions.empty() || !hasRuntimeHeaders) + return false; + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return true; + + auto pcuDriverGetVersion = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuDriverGetVersion); + int loadedDriverVersion = 0; + if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) + return false; + + if (!nativeNVRTCOutputProbe(*handler)) + return false; + + if (!cudaFp16HeaderCompileProbe(*handler)) + return false; + + int deviceCount = 0; + if (handler->getCUDAFunctionTable().pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount==0) + return true; + + CUdevice device = {}; + if (handler->getCUDAFunctionTable().pcuDeviceGet(&device,0)!=CUDA_SUCCESS) + return false; + + return cudaDriverRoundtrip(*handler, device); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp new file mode 100644 index 0000000000..73307599b1 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -0,0 +1,40 @@ +#include "nabla.h" + 
+#include "nbl/system/IApplicationFramework.h" +#include "nbl/video/CUDAInterop.h" +#include + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + +namespace +{ + +class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 27f8f6f906..25caa0162b 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,72 +1,138 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" + +namespace nbl::video +{ + +CCUDADevice::E_VIRTUAL_ARCHITECTURE CCUDADevice::getVirtualArchitecture() const +{ + return m_virtualArchitecture; +} + +core::SRange CCUDADevice::geDefaultCompileOptions() const +{ + return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; +} + +const CCUDAHandler* CCUDADevice::getHandler() const +{ + return m_handler.get(); +} + +} + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#include "nbl/video/CCUDAImportedMemory.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +namespace +{ + +constexpr const char* VirtualArchCompileOption[] = { + "-arch=compute_30", + "-arch=compute_32", + "-arch=compute_35", + "-arch=compute_37", + "-arch=compute_50", + "-arch=compute_52", + "-arch=compute_53", + "-arch=compute_60", + "-arch=compute_61", + "-arch=compute_62", + "-arch=compute_70", + "-arch=compute_72", + "-arch=compute_75", + "-arch=compute_80" +}; + +static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); + +static CUmemAllocationHandleType getAllocationHandleType() +{ +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + +} + CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& vulkanConnection, - IPhysicalDevice* const vulkanDevice, + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - CUdevice device, - core::smart_refctd_ptr&& handler) : + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), - m_defaultCompileOptions(), - m_vulkanConnection(std::move(vulkanConnection)), - m_physicalDevice(vulkanDevice), - 
m_virtualArchitecture(virtualArchitecture), - m_handle(device), + m_defaultCompileOptions(), + m_vulkanConnection(std::move(vulkanConnection)), + m_virtualArchitecture(virtualArchitecture), m_handler(std::move(handler)), - m_allocationGranularity{} + m_native(std::move(nativeState)) { + assert(m_native); + m_defaultCompileOptions.push_back("--std=c++14"); - m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); + m_defaultCompileOptions.push_back(VirtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); + if (!m_handler->defaultHandleResult(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + return; + if (!m_handler->defaultHandleResult(cu.pcuCtxSetCurrent(m_native->context))) + return; for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - - #ifdef _WIN32 - OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) - }; - #endif - - const auto prop = CUmemAllocationProp{ - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = static_cast(locationType), .id = m_handle }, - #ifdef _WIN32 - .win32HandleMetaData = &metadata, - #endif - }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = 
static_cast(locationType), .id = m_native->handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; + if (!m_handler->defaultHandleResult(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + return; } + m_valid = true; } -size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; + return m_native->handle; } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const +cuda_interop::SCUcontext CCUDADevice::getContext() const { - const auto& cu = m_handler->getCUDAFunctionTable(); + return m_native->context; +} + +static bool isDeviceLocal(CUmemLocationType location) +{ + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + +static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice nativeDevice, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +{ + const auto& cu = handler.getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -74,19 +140,19 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = m_handle }, + .location = { .type = location, .id = nativeDevice }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) 
{ - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + handler.defaultHandleResult(cu.pcuMemUnmap(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } @@ -95,28 +161,36 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; + const auto handler = getHandler(); + const auto location = static_cast(inParams.locationType); + + CCUDAExportableMemory::SCachedCreationParams params = { + .granularSize = roundToGranularity(inParams.locationType, inParams.size), + .deviceLocal = isDeviceLocal(location) + }; + if (params.granularSize==0u) + return nullptr; + + auto& cu = handler->getCUDAFunctionTable(); - auto& cu = m_handler->getCUDAFunctionTable(); - #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) + .Length = sizeof(OBJECT_ATTRIBUTES) }; #endif - const auto prop = CUmemAllocationProp{ + const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = location, .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - params.granularSize = roundToGranularity(params.location, params.size); + auto nativeState = std::make_unique(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -128,30 +202,33 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (auto err = 
cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, inParams.alignment, location, mem); CUDA_SUCCESS != err) { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + if (!CloseExternalHandle(params.externalHandle)) + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + handler->defaultHandleResult(err); + handler->defaultHandleResult(cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); + if (!CloseExternalHandle(params.externalHandle)) + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); + return CCUDAExportableMemory::create(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) @@ -179,7 +256,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co 
m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); + return core::smart_refctd_ptr( + new CCUDAImportedMemory(core::smart_refctd_ptr(this),std::move(mem),std::make_unique(cuExtMem)), + core::dont_grab + ); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) @@ -193,10 +273,9 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - // TODO(kevinyu): Fix this later. Make it compile first. .handle = {.win32 = {.handle = sema->getExternalHandle() }}, #else - .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, .handle = {.fd = sema->getExternalHandle()} #endif }; @@ -209,12 +288,79 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); + return core::smart_refctd_ptr( + new CCUDAImportedSemaphore(core::smart_refctd_ptr(this),std::move(sema),std::make_unique(cusema)), + core::dont_grab + ); } CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); + if (m_native->context) + m_handler->defaultHandleResult(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); +} + +bool CCUDADevice::isValid() const +{ + return m_valid; +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDADevice::SNativeState {}; + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_virtualArchitecture(virtualArchitecture) + , m_valid(false) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDADevice::~CCUDADevice() = default; + +bool CCUDADevice::isValid() const +{ + return false; +} + +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const +{ + return {}; +} + +cuda_interop::SCUcontext CCUDADevice::getContext() const +{ + return {}; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 66cbbdcf4f..9333a39f54 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,27 +2,40 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; - default: break; - } + if (m_params.deviceLocal) + memoryTypeBits &= vram; + else + memoryTypeBits &= ~vram; IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; req.size = m_params.granularSize; @@ -41,14 +54,58 @@ CCUDAExportableMemory::~CCUDAExportableMemory() { const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); + m_device->getHandler()->defaultHandleResult(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); + + 
m_device->getHandler()->defaultHandleResult(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + if (!CloseExternalHandle(m_params.externalHandle)) + m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); - bool closeSucceed = CloseExternalHandle(m_params.externalHandle); - assert(closeSucceed); +} + +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return m_native->ptr; +} } + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAExportableMemory::SNativeState {}; + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return {}; +} + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 060afe6631..c07af698b1 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -2,60 +2,527 @@ // This file is part of the "Nabla 
Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAHandler.h" -#include "nbl/system/CFileView.h" +#include "nbl/video/CUDAInterop.h" + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ +namespace +{ + +#if defined(_NBL_PLATFORM_WINDOWS_) +inline constexpr char EnvironmentPathListSeparator = ';'; +#else +inline constexpr char EnvironmentPathListSeparator = ':'; +#endif + +std::string readEnvironmentVariable(const char* name) +{ + if (const char* value = std::getenv(name)) + return value; + return {}; +} + +bool isDirectory(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error); +} + +bool isRegularFile(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error); +} + +system::path normalizedAbsolute(system::path path) +{ + std::error_code error; + auto absolute = std::filesystem::absolute(path,error); + if (error) + absolute = std::move(path); + return absolute.lexically_normal(); +} + +bool looksLikeCUDAIncludeDir(const system::path& path) +{ + if (!isDirectory(path)) + return false; + + return isRegularFile(path/"cuda_fp16.h") || + isRegularFile(path/"cuda_runtime_api.h") || + isRegularFile(path/"vector_types.h") || + isRegularFile(path/"cuda.h") || + isRegularFile(path/"nv"/"target"); +} + +uint32_t readCUDAVersion(const system::path& includeDir) +{ + std::ifstream input(includeDir/"cuda.h"); + if (!input) + return 0u; + + std::string line; + while (std::getline(input,line)) + { + std::istringstream stream(line); + std::string directive; + stream >> directive; + if (directive!="#define") + continue; + + std::string name; + stream >> name; + if (name!="CUDA_VERSION") + continue; + + uint32_t version = 0u; + if (stream >> version) + return version; + } + return 
0u; +} + +bool looksLikeCompleteRuntimeHeaderSet(const system::path& includeDir) +{ + return isRegularFile(includeDir/"cuda.h") && + isRegularFile(includeDir/"cuda_runtime_api.h") && + isRegularFile(includeDir/"vector_types.h"); +} + +void appendIncludeDir(SRuntimeCompileEnvironment& environment, system::path path, std::string source) +{ + if (path.empty() || !looksLikeCUDAIncludeDir(path)) + return; + + path = normalizedAbsolute(std::move(path)); + const auto pathString = path.generic_string(); + const auto alreadyAdded = std::find_if(environment.includeDirs.begin(),environment.includeDirs.end(),[&](const system::path& existing) { + return existing.generic_string()==pathString; + }); + if (alreadyAdded==environment.includeDirs.end()) + { + SRuntimeIncludeDir info; + info.path = path; + info.source = std::move(source); + info.cudaVersion = readCUDAVersion(path); + info.completeRuntimeHeaderSet = looksLikeCompleteRuntimeHeaderSet(path); + + environment.includeDirs.push_back(std::move(path)); + environment.includeDirInfos.push_back(std::move(info)); + } +} + +void appendCUDAIncludeDirsBelow(SRuntimeCompileEnvironment& environment, const system::path& root, uint32_t maxDepth, std::string source) +{ + if (!isDirectory(root)) + return; + + if (looksLikeCUDAIncludeDir(root)) + { + appendIncludeDir(environment,root,std::move(source)); + return; + } + if (maxDepth==0u) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + + std::error_code entryError; + if (!entry.is_directory(entryError)) + continue; + candidates.push_back(entry.path()); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendCUDAIncludeDirsBelow(environment,candidate,maxDepth-1u,source); +} + +void 
appendCUDAIncludeRoot(SRuntimeCompileEnvironment& environment, const system::path& root, std::string source) +{ + if (root.empty()) + return; + + appendIncludeDir(environment,root,source); + appendIncludeDir(environment,root/"include",std::move(source)); +} + +void appendRuntimePathsConfig(SRuntimeCompileEnvironment& environment, const system::path& configFile, const char* source) +{ + if (!isRegularFile(configFile)) + return; + + std::ifstream input(configFile); + if (!input) + return; + + const auto json = nlohmann::json::parse(input,nullptr,false); + if (json.is_discarded()) + return; + + const auto paths = json.find("cudaRuntimeIncludeDirs"); + if (paths==json.end() || !paths->is_array()) + return; + + for (const auto& path : *paths) + if (path.is_string()) + appendIncludeDir(environment,system::path(path.get()),std::string(source)+": "+configFile.generic_string()); +} + +template +void appendPathListEnv(const char* name, Append append) +{ + const auto value = readEnvironmentVariable(name); + if (value.empty()) + return; + + size_t begin = 0; + while (begin& explicitRuntimePathFiles) +{ + for (const auto& runtimePathFile : explicitRuntimePathFiles) + appendRuntimePathsConfig(environment,runtimePathFile,"explicit runtime JSON"); + + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + }); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + }); + + const auto exeDir = system::executableDirectory(); + if (!exeDir.empty()) + appendRuntimePathsConfig(environment,exeDir/RuntimePathsFileName,"executable-local runtime JSON"); +} + +void appendAppLocalIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + const auto exeDir = system::executableDirectory(); + if (exeDir.empty()) + return; + + 
appendIncludeDir(environment,exeDir/"cuda"/"include","app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir/"nvidia",4u,"app-local nvidia package"); + appendIncludeDir(environment,exeDir/"Libraries"/"cuda"/"include","app-local Libraries/cuda/include"); + appendIncludeDir(environment,exeDir.parent_path()/"cuda"/"include","parent app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir.parent_path()/"nvidia",4u,"parent app-local nvidia package"); +} + +void appendPythonPackageIncludeDirs(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) +{ + if (root.empty()) + return; + + appendCUDAIncludeDirsBelow(environment,root/"Lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendCUDAIncludeDirsBelow(environment,root/"lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendIncludeDir(environment,root/"Library"/"include",std::string(source)+" Library/include"); + appendIncludeDir(environment,root/"include",std::string(source)+" include"); +} + +void appendEnvironmentIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_PATH"),"CUDA_PATH"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_HOME"),"CUDA_HOME"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_ROOT"),"CUDA_ROOT"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDAToolkit_ROOT"),"CUDAToolkit_ROOT"); + + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("VIRTUAL_ENV"),"VIRTUAL_ENV"); + 
appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("CONDA_PREFIX"),"CONDA_PREFIX"); +} + +void appendCUDAInstallRoots(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) +{ + if (!isDirectory(root)) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + if (!entry.is_directory(error)) + continue; + candidates.push_back(entry.path()/"include"); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendIncludeDir(environment,candidate,source); +} + +void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + appendCUDAInstallRoots(environment,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA","system CUDA Toolkit install root"); + #else + appendIncludeDir(environment,"/usr/local/cuda/include","system /usr/local/cuda"); + appendCUDAInstallRoots(environment,"/usr/local","system /usr/local CUDA install root"); + appendIncludeDir(environment,"/usr/include","system /usr/include"); + #endif +} + +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles) +{ + SRuntimeCompileEnvironment environment; + + /* + Runtime header discovery builds the ordered include list passed to NVRTC. It is not a lock to the CUDA SDK + used to build Nabla. A packaged Nabla must stay relocatable, so host-specific include paths are accepted + only when the application provides them at runtime: direct arguments, JSON next to the executable, an + override JSON, app-local header bundles, environment variables, or finally common toolkit install roots. + + The first root containing a requested header wins exactly like normal C/C++ include search. 
Keep every + accepted root with its source and parsed CUDA_VERSION so startup logs can explain what NVRTC will see. + This is also why mismatched or partial roots produce diagnostics instead of changing discovery order or + hard-failing before the user kernel is compiled. + */ + for (const auto& includeDir : explicitIncludeDirs) + appendIncludeDir(environment,includeDir,"explicit include dir"); + + appendRuntimePathsConfigs(environment,runtimePathFiles); + appendAppLocalIncludeDirs(environment); + appendEnvironmentIncludeDirs(environment); + appendSystemIncludeDirs(environment); + + return environment; +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs) +{ + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(explicitIncludeDirs,EmptyRuntimePathFiles); +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment() +{ + static const core::vector EmptyIncludeDirs; + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(EmptyIncludeDirs,EmptyRuntimePathFiles); +} + +} #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" +#include "nbl/system/CFileView.h" #include "jitify/jitify.hpp" namespace nbl::video { + +namespace +{ + +int cudaVersionMajor(int version) +{ + return version/1000; +} + +int cudaVersionMinor(int version) +{ + return (version%1000)/10; +} + +int cudaVersionCode(int major, int minor) +{ + return major*1000+minor*10; +} + +system::path loadedRuntimeModulePath(const char* moduleName) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto moduleDir = system::loadedModuleDirectory(moduleName); + if (moduleDir.empty()) + return {}; + return moduleDir/(std::string(moduleName)+".dll"); + #else + return {}; + #endif +} + +std::string cudaVersionString(int version) +{ + std::ostringstream stream; + stream << cudaVersionMajor(version) << "." 
<< cudaVersionMinor(version); + return stream.str(); +} + +std::string cudaVersionString(const cuda_interop::SRuntimeVersion& version) +{ + std::ostringstream stream; + stream << version[0] << "." << version[1]; + return stream.str(); +} + +std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& includeDir) +{ + std::ostringstream stream; + stream << includeDir.path.generic_string() << " (" << includeDir.source; + if (includeDir.cudaVersion!=0u) + stream << ", CUDA_VERSION " << includeDir.cudaVersion << " / " << cudaVersionString(includeDir.cudaVersion); + else + stream << ", CUDA_VERSION unknown"; + if (!includeDir.completeRuntimeHeaderSet) + stream << ", partial header root"; + stream << ")"; + return stream.str(); +} + +std::string cudaRuntimeReport( + const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, + const cuda_interop::SRuntimeVersion& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) +{ + std::ostringstream stream; + stream << "CCUDAHandler: CUDA interop runtime report:\n"; + stream << " - Nabla build CUDA SDK: " << cudaVersionString(buildVersion) << "\n"; + stream << " - CUDA Driver API: " << cudaVersionString(cudaDriverVersion); + if (!cudaDriverPath.empty()) + stream << " (" << cudaDriverPath.generic_string() << ")"; + stream << "\n"; + stream << " - NVRTC runtime: " << cudaVersionString(nvrtcVersion) << " (" << nvrtcLibraryName; + if (!nvrtcPath.empty()) + stream << ", " << nvrtcPath.generic_string(); + stream << ")\n"; + + if (runtimeEnvironment.includeDirs.empty()) + { + stream << " - NVRTC runtime header search path: none discovered"; + } + else + { + stream << " - Primary NVRTC runtime header path: " << runtimeIncludeDirDescription(runtimeEnvironment.includeDirInfos.front()) << "\n"; + stream << " - NVRTC runtime header search order (first path containing the requested header wins):\n"; 
+ for (const auto& includeDir : runtimeEnvironment.includeDirInfos) + stream << " - " << runtimeIncludeDirDescription(includeDir) << "\n"; + } + return stream.str(); +} + +} CCUDAHandler::CCUDAHandler( - CUDA&& _cuda, - NVRTC&& _nvrtc, + std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) - : m_cuda(std::move(_cuda)) - , m_nvrtc(std::move(_nvrtc)) + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { + assert(m_native); + for (auto& header : m_headers) { m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); m_headerNamesStorage.push_back(header->getFileName().string()); m_headerNames.push_back(m_headerNamesStorage.back().c_str()); } + for (const auto& option : m_native->runtimeIncludeOptions) + m_native->runtimeIncludeOptionPtrs.push_back(option.c_str()); int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) return; for (int device_i = 0; device_i < deviceCount; device_i++) { CUdevice handle = -1; - if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - m_availableDevices.emplace_back(handle, uuid); + auto& nativeDevice = m_native->deviceStates.emplace_back(); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; + auto& cleanDevice = m_availableDevices.emplace_back(); + memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = m_availableDevices.back().attributes; - for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - 
m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + for (size_t i = 0; i < nativeDevice.attributes.size(); i++) + m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast(i), handle); } } -bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +CCUDAHandler::~CCUDAHandler() = default; + +uint32_t CCUDAHandler::getBuildCUDASDKVersion() +{ + return CUDA_VERSION; +} + +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return m_native->cudaDriverVersion; +} + +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const +{ + return m_native->nvrtcVersion; +} + +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + return m_native->cuda; +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const { + return m_native->nvrtc; +} + +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + if (m_native->runtimeIncludeOptionPtrs.empty()) + return {nullptr,nullptr}; + const auto* begin = m_native->runtimeIncludeOptionPtrs.data(); + return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; +} + +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult, const system::logger_opt_ptr& logger) +{ + const CUresult result = opaqueResult; switch (result) { case CUDA_SUCCESS: @@ -176,6 +643,11 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt This indicates that a PTX JIT compilation failed. )===",system::ILogger::ELL_ERROR); break; + case CUDA_ERROR_UNSUPPORTED_PTX_VERSION: + logger.log(R"===(CCUDAHandler: + This indicates that the PTX version is unsupported by the CUDA driver. Check that the CUDA driver runtime can consume PTX produced by the loaded NVRTC runtime. + )===",system::ILogger::ELL_ERROR); + break; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: logger.log(R"===(CCUDAHandler: This indicates an error with OpenGL or DirectX context. 
@@ -413,34 +885,54 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt break; case CUDA_ERROR_UNKNOWN: default: - logger.log("CCUDAHandler: Unknown CUDA Error!\n",system::ILogger::ELL_ERROR); + logger.log("CCUDAHandler: Unknown CUDA error code %d.",system::ILogger::ELL_ERROR,static_cast(result)); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } -bool CCUDAHandler::defaultHandleResult(nvrtcResult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult) const { + const CUresult result = opaqueResult; + if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) + { + const auto cudaVersion = getLoadedCUDADriverVersion(); + const auto nvrtcVersion = getLoadedNVRTCVersion(); + getLogger().log( + "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", + system::ILogger::ELL_ERROR, + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), + nvrtcVersion[0],nvrtcVersion[1] + ); + } + return defaultHandleResult(opaqueResult,getLogger()); +} + +bool CCUDAHandler::defaultHandleResult(cuda_interop::SNVRTCResult opaqueResult) const +{ + const nvrtcResult result = opaqueResult; + const auto& nvrtc = getNVRTCFunctionTable(); + const auto logger = getLogger(); switch (result) { case NVRTC_SUCCESS: return true; break; default: - if (m_nvrtc.pnvrtcGetErrorString) - m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result)); + if (nvrtc.pnvrtcGetErrorString) + logger.log("%s\n",system::ILogger::ELL_ERROR,nvrtc.pnvrtcGetErrorString(result)); else - m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } - 
_NBL_DEBUG_BREAK_IF(true); return false; } core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { - CUDA cuda = CUDA( + const system::logger_opt_ptr logger(_logger.get()); + + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" #elif defined(_NBL_POSIX_API_) @@ -449,11 +941,71 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #error "Unsuported Platform" #endif ); + + // need a complex safe calling chain because DLL/SO might not have loaded + #define SAFE_CUDA_CALL(FUNC,...) \ + {\ + if (!cuda.p ## FUNC)\ + {\ + logger.log("CCUDAHandler: CUDA Driver API function %s was not found. Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\ + return nullptr;\ + }\ + auto result = cuda.p ## FUNC(__VA_ARGS__);\ + if (result!=CUDA_SUCCESS)\ + {\ + logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast(result));\ + return nullptr;\ + }\ + } - NVRTC nvrtc = {}; + SAFE_CUDA_CALL(cuInit,0) + + int cudaVersion = 0; + SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) + if (cudaVersion bool + { + if (!candidate.pnvrtcVersion) + return false; + + const auto result = candidate.pnvrtcVersion(version.data(),version.data()+1); + if (result==NVRTC_SUCCESS) + return true; + + logger.log("CCUDAHandler: nvrtcVersion failed for %s with NVRTC error code %d.",system::ILogger::ELL_WARNING,name,static_cast(result)); + version = {-1,-1}; + return false; + }; + + cuda_native::NVRTC nvrtc = {}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; + std::string nvrtcLibraryName; + #if defined(_NBL_WINDOWS_API_) - // Perpetual TODO: any new CUDA releases we need to account for? 
- // Version List: https://developer.nvidia.com/cuda-toolkit-archive + cuda_native::NVRTC fallbackNVRTC = {}; + cuda_interop::SRuntimeVersion fallbackNVRTCVersion = {-1,-1}; + std::string fallbackNVRTCLibraryName; + + /* + The CUDA driver consumes the final PTX, not the toolkit that provided headers or nvrtc*.dll. + A real machine can have an older NVIDIA driver and a newer CUDA Toolkit side by side, for example + driver API 13.1 from nvcuda.dll with CUDA 13.2 Toolkit/NVRTC in CUDA_PATH. In that setup NVRTC can + emit PTX the installed driver rejects with CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Prefer an NVRTC runtime + that is not newer than the loaded driver and log the full version matrix when no compatible one exists. + */ const char* nvrtc64_versions[] = { "nvrtc64_132", "nvrtc64_131", @@ -462,55 +1014,109 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste }; const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; - for (auto verpath=nvrtc64_versions; *verpath; verpath++) + for (auto verpath=nvrtc64_versions; *verpath && !nvrtc.pnvrtcVersion; verpath++) { for (auto suffix=nvrtc64_suffices; *suffix; suffix++) { - std::string path(*verpath); - path += *suffix; - nvrtc = NVRTC(path.c_str()); - if (nvrtc.pnvrtcVersion) + std::string candidateName(*verpath); + candidateName += *suffix; + + cuda_native::NVRTC candidate(candidateName.c_str()); + cuda_interop::SRuntimeVersion candidateVersion = {-1,-1}; + if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) + continue; + + if (cudaVersionCode(candidateVersion[0],candidateVersion[1])<=cudaVersion) + { + nvrtc = std::move(candidate); + nvrtcVersion = candidateVersion; + nvrtcLibraryName = std::move(candidateName); break; + } + + if (!fallbackNVRTC.pnvrtcVersion) + { + fallbackNVRTC = std::move(candidate); + fallbackNVRTCVersion = candidateVersion; + fallbackNVRTCLibraryName = std::move(candidateName); + } } - if (nvrtc.pnvrtcVersion) - break; + } + + if (!nvrtc.pnvrtcVersion 
&& fallbackNVRTC.pnvrtcVersion) + { + nvrtc = std::move(fallbackNVRTC); + nvrtcVersion = fallbackNVRTCVersion; + nvrtcLibraryName = std::move(fallbackNVRTCLibraryName); } #elif defined(_NBL_POSIX_API_) - nvrtc = NVRTC("nvrtc"); - //nvrtc_builtins = NVRTC("nvrtc-builtins"); + nvrtcLibraryName = "nvrtc"; + nvrtc = cuda_native::NVRTC(nvrtcLibraryName.c_str()); + readNVRTCVersion(nvrtc,nvrtcVersion,nvrtcLibraryName.c_str()); #else #error "Unsuported Platform" #endif - - // need a complex safe calling chain because DLL/SO might not have loaded - #define SAFE_CUDA_CALL(FUNC,...) \ - {\ - if (!cuda.p ## FUNC)\ - return nullptr;\ - auto result = cuda.p ## FUNC ## (__VA_ARGS__);\ - if (result!=CUDA_SUCCESS)\ - return nullptr;\ + // check nvrtc existence and compatibility + if (!nvrtc.pnvrtcVersion) + { + logger.log("CCUDAHandler: NVRTC runtime was not found. Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); + return nullptr; } - - SAFE_CUDA_CALL(cuInit,0) - - int cudaVersion = 0; - SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<13000) + if (nvrtcVersion[0]cudaVersion) + { + logger.log( + "CCUDAHandler: NVRTC runtime %d.%d is newer than CUDA driver API %d.%d. PTX generated by this NVRTC may be unsupported by the installed driver.", + system::ILogger::ELL_WARNING, + nvrtcVersion[0],nvrtcVersion[1], + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion) + ); + } + if (runtimeEnvironment.includeDirs.empty()) + { + logger.log("CCUDAHandler: no CUDA runtime headers were discovered for NVRTC include paths.",system::ILogger::ELL_WARNING); + } + else + { + const auto& primaryIncludeDir = runtimeEnvironment.includeDirInfos.front(); + if (!primaryIncludeDir.completeRuntimeHeaderSet) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime header path %s does not contain cuda.h, cuda_runtime_api.h, and vector_types.h together. 
NVRTC may use later include paths for missing headers.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.path.generic_string().c_str() + ); + } - // check nvrtc existence and compatibility - if (!nvrtc.pnvrtcVersion) - return nullptr; - int nvrtcVersion[2] = { -1,-1 }; - nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersion[0]<9) - return nullptr; + const auto nvrtcVersionCode = cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1]); + if (primaryIncludeDir.cudaVersion!=0u && primaryIncludeDir.cudaVersion!=static_cast(nvrtcVersionCode)) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime headers report CUDA_VERSION %u (%s), while loaded NVRTC is %s. This is allowed by discovery policy, but kernels using version-specific CUDA headers may fail to compile.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.cudaVersion, + cudaVersionString(primaryIncludeDir.cudaVersion).c_str(), + cudaVersionString(nvrtcVersion).c_str() + ); + } + } // add headers core::vector> headers; @@ -520,16 +1126,18 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste headers.push_back(core::make_smart_refctd_ptr>( it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, - // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? 
std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } - return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); + return core::smart_refctd_ptr( + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc),cudaVersion,nvrtcVersion,std::move(runtimeEnvironment)),std::move(headers),std::move(_logger)), + core::dont_grab + ); } -nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -538,26 +1146,40 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source #else #error "Unsuported Platform" #endif - return m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + nvrtcProgram nativeProgram = nullptr; + const auto result = getNVRTCFunctionTable().pnvrtcCreateProgram(&nativeProgram,source.c_str(),name,headerCount,headerContents,includeNames); + if (prog) + *prog = nativeProgram; + return result; } -nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log) +cuda_interop::SNVRTCResult CCUDAHandler::compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const +{ + const nvrtcProgram nativeProgram = prog; + return getNVRTCFunctionTable().pnvrtcCompileProgram(nativeProgram,options.size(),options.begin()); +} + +cuda_interop::SNVRTCResult CCUDAHandler::getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, 
&_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(nativeProgram, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return m_nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(nativeProgram,log.data()); } -CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) +CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) const { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(nativeProgram,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -567,7 +1189,51 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; +} + +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string* log) +{ + if (log) + log->clear(); + const nvrtcResult nativeResult = result; + if (nativeResult!=NVRTC_SUCCESS) + return {nullptr,result}; + + const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); + core::vector options; + options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); + for (const auto option : nvrtcOptions) + options.push_back(option); + for (const auto option : runtimeIncludeOptions) + options.push_back(option); + + const auto* optionsBegin = options.empty() ? 
nullptr:options.data(); + const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); + result = handler.compileProgram(program,{optionsBegin,optionsEnd}); + if (log) + handler.getProgramLog(program,*log); + if (static_cast(result)!=NVRTC_SUCCESS) + return {nullptr,result}; + + return handler.getPTX(program); +} + +CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string* log, const int headerCount, const char* const* headerContents, const char* const* includeNames) +{ + cuda_interop::SNVRTCProgram program = {}; + cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + nvrtcProgram nativeProgram = program; + if (nativeProgram) + getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); + }); + + result = createProgram(program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -578,7 +1244,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_availableDevices) + for (const auto& device : m_native->deviceStates) { if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { @@ -662,7 +1328,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); + auto cudaDevice = core::smart_refctd_ptr( + new 
CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + core::dont_grab + ); + if (!cudaDevice->isValid()) + return nullptr; + return std::move(cudaDevice); } } return nullptr; @@ -670,4 +1342,67 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAHandler::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) +{ + assert(m_native); +} + +CCUDAHandler::~CCUDAHandler() = default; + +uint32_t CCUDAHandler::getBuildCUDASDKVersion() +{ + return 0u; +} + +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return 0u; +} + +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const +{ + return {-1,-1}; +} + +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + std::abort(); +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + std::abort(); +} + +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + return {nullptr,nullptr}; +} + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 7e21b05ef1..ec5438643f 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -2,31 +2,82 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" namespace nbl::video { -CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) { - CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; - bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); + assert(m_native); +} - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); - +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return m_native->handle; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput mappedBuffer) const +{ + if (!mappedBuffer) + return false; + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!m_device->getHandler()->defaultHandleResult(result)) + return false; + + *mappedBuffer = nativeMappedBuffer; + return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalMemory(m_native->handle)); +} + +} + +#else + +namespace nbl::video +{ + +// 
CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAImportedMemory::SNativeState {}; + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return {}; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput) const +{ + return false; } } -#endif \ No newline at end of file +#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 0dc750a4a9..49495e11e2 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -2,17 +2,56 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedSemaphore.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() const +{ + return m_native->handle; +} + CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalSemaphore(m_native->handle)); +} } + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() const +{ + return {}; +} + } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp new file mode 100644 index 0000000000..04a70c6e4e --- /dev/null +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -0,0 +1,81 @@ +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ + +#include "nbl/video/CUDAInteropNativeAPI.h" + +#include +#include + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState +{ + struct SDeviceState + { + CUdevice handle = {}; + CUuuid uuid = {}; + std::array attributes = {}; + }; + + cuda_native::CUDA cuda; + cuda_native::NVRTC nvrtc; + int cudaDriverVersion = 0; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; + // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
+ cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; + core::vector runtimeIncludeOptions; + core::vector runtimeIncludeOptionPtrs; + core::vector deviceStates; + + SNativeState( + cuda_native::CUDA&& _cuda, + cuda_native::NVRTC&& _nvrtc, + int _cudaDriverVersion, + cuda_interop::SRuntimeVersion _nvrtcVersion, + cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) + : cuda(std::move(_cuda)) + , nvrtc(std::move(_nvrtc)) + , cudaDriverVersion(_cudaDriverVersion) + , nvrtcVersion(_nvrtcVersion) + , runtimeEnvironment(std::move(_runtimeEnvironment)) + , runtimeIncludeOptions(cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment)) + {} +}; + +struct CCUDADevice::SNativeState +{ + CUdevice handle = {}; + CUcontext context = nullptr; + + explicit SNativeState(CUdevice _handle) + : handle(_handle) + {} +}; + +struct CCUDAExportableMemory::SNativeState +{ + CUdeviceptr ptr = 0; +}; + +struct CCUDAImportedMemory::SNativeState +{ + CUexternalMemory handle = nullptr; + + explicit SNativeState(CUexternalMemory _handle) + : handle(_handle) + {} +}; + +struct CCUDAImportedSemaphore::SNativeState +{ + CUexternalSemaphore handle = nullptr; + + explicit SNativeState(CUexternalSemaphore _handle) + : handle(_handle) + {} +}; + +} + +#endif