Move CUDA interop behind native opt-in#1067
Move CUDA interop behind native opt-in #1067 — AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
Conversation
| // Opt-in native CUDA API. The declarations below are implemented by the Nabla library. | ||
| // This header is intentionally the only public path that includes CUDA SDK types. | ||
| class NBL_API2 CCUDAHandlerAccessor | ||
| { | ||
| public: | ||
| static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); | ||
| static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); | ||
| static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); | ||
| static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); | ||
| static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); | ||
| static const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler); | ||
| static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); | ||
| static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options); | ||
| static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); | ||
| static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); | ||
| static SPTXResult compileDirectlyToPTX( | ||
| CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions, | ||
| std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr | ||
| ); | ||
| }; | ||
|
|
||
| class NBL_API2 CCUDADeviceAccessor | ||
| { | ||
| public: | ||
| static CUdevice getInternalObject(const CCUDADevice& device); | ||
| static CUcontext getContext(const CCUDADevice& device); | ||
| static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); | ||
| static core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); | ||
| }; | ||
|
|
||
| class NBL_API2 CCUDAExportableMemoryAccessor | ||
| { | ||
| public: | ||
| static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); | ||
| }; | ||
|
|
||
| class NBL_API2 CCUDAImportedMemoryAccessor | ||
| { | ||
| public: | ||
| static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); | ||
| static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); | ||
| }; | ||
|
|
||
| class NBL_API2 CCUDAImportedSemaphoreAccessor | ||
| { | ||
| public: | ||
| static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); | ||
| }; |
There was a problem hiding this comment.
Accessors make no sense; just move all of the nbl/video/CCUDA*.h headers to the extension.
| #define ASSERT_CUDA_SUCCESS(expr, handler) \ | ||
| do { \ | ||
| const auto cudaResult = (expr); \ | ||
| if (!((handler)->defaultHandleResult(cudaResult))) { \ | ||
| assert(false); \ | ||
| } \ | ||
| } while(0) | ||
|
|
There was a problem hiding this comment.
That macro was useful; it just needs a rename.
| inline bool CloseExternalHandle(external_handle_t handle) | ||
| { | ||
| #ifdef _WIN32 | ||
| return CloseHandle(handle); | ||
| #else | ||
| return (close(handle) == 0); | ||
| #endif | ||
| } | ||
|
|
||
| inline external_handle_t DuplicateExternalHandle(external_handle_t handle) | ||
| { | ||
| #ifdef _WIN32 | ||
| HANDLE re = ExternalHandleNull; | ||
|
|
||
| const HANDLE cur = GetCurrentProcess(); | ||
| if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) | ||
| return ExternalHandleNull; | ||
|
|
||
| return re; | ||
| #else | ||
| return dup(handle); | ||
| #endif |
There was a problem hiding this comment.
You may want to keep that inline; these are OS calls, and when they're inline they'll work BEFORE Nabla.dll is delay-loaded, which is useful.
| #include "nbl/video/CUDAInterop.h" | ||
| #include "nbl/system/IApplicationFramework.h" | ||
|
|
||
| #include <type_traits> | ||
|
|
||
| #ifdef _NBL_COMPILE_WITH_CUDA_ | ||
| #error "Nabla::Nabla must not propagate the CUDA build define." | ||
| #endif | ||
|
|
||
| #ifdef CUDA_VERSION | ||
| #error "Nabla::Nabla must not require CUDA SDK headers." | ||
| #endif | ||
|
|
||
| namespace | ||
| { | ||
|
|
||
| class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework | ||
| { | ||
| using base_t = nbl::system::IApplicationFramework; | ||
|
|
||
| public: | ||
| using base_t::base_t; | ||
|
|
||
| bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override | ||
| { | ||
| static_assert(std::is_class_v<nbl::video::CCUDADevice>); | ||
| static_assert(std::is_class_v<nbl::video::CCUDAExportableMemory>); | ||
| static_assert(std::is_class_v<nbl::video::CCUDAImportedMemory>); | ||
| static_assert(std::is_class_v<nbl::video::CCUDAImportedSemaphore>); |
There was a problem hiding this comment.
it would make more sense to not have anything CUDA related in Nabla itself
| const auto& granularity = SAccess::native(device).allocationGranularity[location]; | ||
| return ((size - 1) / granularity + 1) * granularity; | ||
| } | ||
|
|
There was a problem hiding this comment.
I mentioned in the original PR, this should be inline
| if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) | ||
| assert(false); | ||
| return err; | ||
| } | ||
|
|
||
| CUmemAccessDesc accessDesc = { | ||
| .location = { .type = location, .id = m_handle }, | ||
| .location = { .type = location, .id = native.handle }, | ||
| .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, | ||
| }; | ||
|
|
||
| if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) | ||
| { | ||
| ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); | ||
| ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); | ||
| if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) | ||
| assert(false); | ||
| if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) | ||
| assert(false); |
There was a problem hiding this comment.
@kevyuu thinking of it, we shouldn't crash an entire program because of a failure here :s
CUDA interop extension ABIAs discussed on Discord, the proposal is to move all CUDA interop code into a plugin as Nabla extension and expose CUDA directly from that target's public headers. We can do that. It protects The goal is not to wrap CUDA. The goal is to keep CUDA SDK-defined layout out of the public class layout. Proposed Plugin ShapeOriginal PR: #1061 Let's say we move the original PR shape into the extension target. In other words, the CUDA interop code no longer lives in The exact code shape from the last commit currently on
If we move this shape into A consumer then compiles those public extension headers again with its own CUDA SDK. Inside our current Nabla examples build this may look fine. The examples are configured in the same build interface as Nabla and the CUDA interop target, so they naturally see the same But that is one project setup, not a general CMake guarantee. Another project can consume Nabla through With an installed package this becomes normal downstream usage. Our package config does the same kind of CUDA discovery on the consumer side. It accepts Nabla/cmake/NablaConfig.cmake.in Lines 88 to 100 in ffba3d4 So the extension binary may have been built with CUDA SDK A, while the downstream project resolves CUDA SDK B through its own package configure. We should not force that downstream SDK to be byte-for-byte the same SDK installed on the host that built the package. If the public extension headers contain SDK-defined layout, the downstream translation units compile that layout from SDK B while the already-built extension binary was compiled with SDK A. Actual RiskRaw CUDA handles like The risky part is exposing SDK-defined layout and inline access through the public extension headers.
Consequences
I decided to check this directly and installed multiple CUDA SDK versions locally.
But The point is not that we want to support CUDA 12.x for this feature. The point is that NVIDIA already changed this value across minor releases inside the same major line. CUDA 12.1 has 133 and CUDA 12.5 has 136. Requiring CUDA 13.x does not show that CUDA 13.0, 13.1, 13.2, or a later 13.x SDK will keep every public-layout macro and generated function table shape identical. So the same public struct has different sizes. If the extension binary fills
A DLL moves implementation into another binary, but the consumer still compiles inline methods, templates, member offsets, and The DLL cannot fix the fact that the consumer and the DLL disagree on the vector element type layout. A static library has the same issue unless it is rebuilt from source with the exact same CUDA SDK as the consumer.
In the original header, Nabla's loader macro expands every listed function into a typed member. That means the function pointer type is taken directly from the CUDA/NVRTC header used by the current translation unit. Local CUDA headers show version-sensitive API surface. If the generated loader tables are public members of
Updating the CUDA SDK path, version, or package can invalidate every translation unit that includes those headers. That is true even when the consumer only wants
With CUDA SDK layout in public extension classes, the true statement is this. It is not this. Build-tree examples do not show the stronger statement. They show our current single-configure examples setup where Nabla, the extension, and the examples share the same CUDA SDK discovery. Other build-interface setups, Accessor ModelThe public Nabla CUDA object layout stays SDK-free. CUDA-dependent layout stays private to the binary. Native CUDA access is still explicit opt-in. So the plugin as Nabla extension model is workable. If it exposes the original CUDA-heavy The accessor/native-state approach is not a CUDA wrapper. It is a small glue layer between Nabla objects and the CUDA world, only for the interop objects that cross that boundary. Raw CUDA is still available through the explicit opt-in header. TLDRYes, we can move all CUDA interop code into Then we explicitly accept that This can pass cleanly in our current examples build because Nabla, the extension, and examples are configured together. That does not cover every build-interface setup, The tradeoff is:
We can move it into a plugin as Nabla extension if that is the preferred direction. My subjective take is that I do not like that model very much, but we can do it as long as we are explicit that this accepts the SDK-dependent public ABI and the potential mismatch, rebuild, and package issues described above. |
|
Excellent report, well presented. I can see the problem now. One thing to note is that newer CUDA SDK versions are not allowed to change handle definitions like CUdeviceptr and the external semaphore handle; otherwise an app compiled against a lower-version SDK wouldn't run with a newer runtime.
Same thing for function signatures like cuCtx*; this is why they have v2 and even v4 variants. They cannot change a signature because the CUDA Driver (not Runtime) API is dynamically linked and you must be able to run with a later driver update.
What you've uncovered here is the reason why we require the SDK version to be >=13.0: one of the function pointers in the table can only be loaded from that SDK. Although I don't intend to provide a guarantee that every function pointer will be loaded and available, it's meant to be a quickly hacked-together, low-effort Volk, but for CUDA — except with no global C functions or state. I think there's a middle-ground compromise we can come to.
|
Ok lets split into two targets with optional leaking headers But instead of accessors lets abuse implicit conversions and constructors struct alignas(8) OpaqueCUdeviceptr
{
// SDK-free stand-in for CUdeviceptr: raw bytes with matching size and alignment,
// so Nabla-side headers never need the CUDA SDK.
uint8_t value[8];
};
//===================================LEAK BOUNDARY========================================
// Everything below this line depends on the CUDA SDK; everything above is SDK-free.
//#include CUDA
using CUdeviceptr = void*; // stand-in for the real SDK typedef, for this sketch only
// trait mapping each opaque handle type to its real CUDA SDK counterpart
template<typename T>
struct opaque_cuda_type;
template<>
struct opaque_cuda_type<OpaqueCUdeviceptr>
{
using type = CUdeviceptr;
};
// Transparent wrapper: converts implicitly both to the real CUDA type (for passing
// straight into CUDA calls) and to the opaque handle (for SDK-free storage).
template<typename Opaque>
struct Wrapped
{
using cuda_t = typename opaque_cuda_type<Opaque>::type;
// Layout compatibility is what makes the reinterpret_cast conversions below work.
static_assert(std::is_trivial_v<cuda_t>);
static_assert(sizeof(Opaque)==sizeof(cuda_t));
static_assert(alignof(Opaque)==alignof(cuda_t));
// constructors
Wrapped() = default;
Wrapped(const Wrapped&) = default;
Wrapped(const cuda_t& val) {operator=(val);}
Wrapped(const Opaque& val) {operator=(val);}
// assignment
inline Wrapped& operator=(const Wrapped&) = default;
inline Wrapped& operator=(const cuda_t& val) {value = val; return *this;}
inline Wrapped& operator=(const Opaque& val) {operator Opaque&() = val; return *this;}
// implicit conversion to a reference to the original CUDA value
inline operator cuda_t&() {return value;}
inline operator const cuda_t&() const {return value;}
// implicit conversion to a reference to the opaque handle
inline operator Opaque&() {return reinterpret_cast<Opaque&>(value);}
inline operator const Opaque&() const {return reinterpret_cast<const Opaque&>(value);}
cuda_t value = {};
};
// Type your code here, or load an example.
CUdeviceptr pCudaCall(CUdeviceptr); // dummy CUDA API call used to exercise the conversions
CUdeviceptr square(int num)
{
OpaqueCUdeviceptr op = {};
Wrapped<OpaqueCUdeviceptr> wp(op);
wp = op; // opaque -> wrapped
wp = pCudaCall(wp); // wrapped converts implicitly to and from the CUDA type
op = wp; // wrapped -> opaque
return wp;
} |
Moves CUDA interop behind SDK-free Nabla headers with explicit
Nabla::ext::CUDAInterop native opt-in. Keeps raw CUDA/NVRTC access available for consumers that ask for the native opt-in while avoiding a default public SDK requirement.