Skip to content

Move CUDA interop behind native opt-in#1067

Open
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS
Open

Move CUDA interop behind native opt-in#1067
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS

Conversation

@AnastaZIuk
Copy link
Copy Markdown
Member

Moves CUDA interop behind SDK-free Nabla headers with explicit Nabla::ext::CUDAInterop native opt-in. Keeps raw CUDA/NVRTC access available for consumers that ask for native opt-in while avoiding default public SDK requirements.

Comment on lines +163 to +210
// Opt-in native CUDA API. The declarations below are implemented by the Nabla library.
// This header is intentionally the only public path that includes CUDA SDK types.
class NBL_API2 CCUDAHandlerAccessor
{
public:
static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
static const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
static SPTXResult compileDirectlyToPTX(
CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr
);
};

class NBL_API2 CCUDADeviceAccessor
{
public:
static CUdevice getInternalObject(const CCUDADevice& device);
static CUcontext getContext(const CCUDADevice& device);
static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
static core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
};

class NBL_API2 CCUDAExportableMemoryAccessor
{
public:
static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
};

class NBL_API2 CCUDAImportedMemoryAccessor
{
public:
static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
};

class NBL_API2 CCUDAImportedSemaphoreAccessor
{
public:
static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accessors make no sense just move all the nbl/video/CCUDA*.h to the extension

Comment on lines -298 to -305
#define ASSERT_CUDA_SUCCESS(expr, handler) \
do { \
const auto cudaResult = (expr); \
if (!((handler)->defaultHandleResult(cudaResult))) { \
assert(false); \
} \
} while(0)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that macro was useful just needs a rename

Comment on lines -31 to -52
inline bool CloseExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
return CloseHandle(handle);
#else
return (close(handle) == 0);
#endif
}

inline external_handle_t DuplicateExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
HANDLE re = ExternalHandleNull;

const HANDLE cur = GetCurrentProcess();
if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS))
return ExternalHandleNull;

return re;
#else
return dup(handle);
#endif
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you may want to keep that inline, these are OS calls, and when they're inline they'll work BEFORE Nabla.dll is delay loaded, which is useful

Comment on lines +1 to +29
#include "nbl/video/CUDAInterop.h"
#include "nbl/system/IApplicationFramework.h"

#include <type_traits>

#ifdef _NBL_COMPILE_WITH_CUDA_
#error "Nabla::Nabla must not propagate the CUDA build define."
#endif

#ifdef CUDA_VERSION
#error "Nabla::Nabla must not require CUDA SDK headers."
#endif

namespace
{

class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework
{
using base_t = nbl::system::IApplicationFramework;

public:
using base_t::base_t;

bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
{
static_assert(std::is_class_v<nbl::video::CCUDADevice>);
static_assert(std::is_class_v<nbl::video::CCUDAExportableMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedSemaphore>);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would make more sense to not have anything CUDA related in Nabla itself

const auto& granularity = SAccess::native(device).allocationGranularity[location];
return ((size - 1) / granularity + 1) * granularity;
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned in the original PR, this should be inline

Comment on lines +104 to +119
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
return err;
}

CUmemAccessDesc accessDesc = {
.location = { .type = location, .id = m_handle },
.location = { .type = location, .id = native.handle },
.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
};

if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err)
{
ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler);
ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)))
assert(false);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevyuu thinking of it we shouldn't crash an entire program because of failure here :s

@AnastaZIuk
Copy link
Copy Markdown
Member Author

CUDA interop extension ABI

As discussed on Discord, the proposal is to move all CUDA interop code into a plugin as Nabla extension and expose CUDA directly from that target's public headers.

We can do that. It protects Nabla::Nabla from the CUDA SDK leak. It does not remove the boundary problem though. It moves that problem to Nabla::ext::CUDAInterop, and then the extension target owns the ABI and versioning consequences.

The goal is not to wrap CUDA. The goal is to keep CUDA SDK-defined layout out of the public class layout.

Proposed Plugin Shape

Original PR: #1061

Let's say we move the original PR shape into the extension target. In other words, the CUDA interop code no longer lives in Nabla::Nabla, but the public CUDA-native headers stay effectively the same and move from include/nbl/video/CCUDA*.h to include/nbl/ext/CUDAInterop/CCUDA*.h.

The exact code shape from the last commit currently on origin/vk_cuda_interop is here:

  • CCUDAHandler.h exposes the CUDA and NVRTC loader table types plus inline getters:
    NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader
    ,cuCtxCreate_v4
    ,cuDevicePrimaryCtxRetain
    ,cuDevicePrimaryCtxRelease
    ,cuDevicePrimaryCtxSetFlags
    ,cuDevicePrimaryCtxGetState
    ,cuCtxDestroy_v2
    ,cuCtxEnablePeerAccess
    ,cuCtxGetApiVersion
    ,cuCtxGetCurrent
    ,cuCtxGetDevice
    ,cuCtxGetSharedMemConfig
    ,cuCtxPopCurrent_v2
    ,cuCtxPushCurrent_v2
    ,cuCtxSetCacheConfig
    ,cuCtxSetCurrent
    ,cuCtxSetSharedMemConfig
    ,cuCtxSynchronize
    ,cuDeviceComputeCapability
    ,cuDeviceCanAccessPeer
    ,cuDeviceGetCount
    ,cuDeviceGet
    ,cuDeviceGetAttribute
    ,cuDeviceGetLuid
    ,cuDeviceGetUuid_v2
    ,cuDeviceTotalMem_v2
    ,cuDeviceGetName
    ,cuDriverGetVersion
    ,cuEventCreate
    ,cuEventDestroy_v2
    ,cuEventElapsedTime
    ,cuEventQuery
    ,cuEventRecord
    ,cuEventSynchronize
    ,cuFuncGetAttribute
    ,cuFuncSetCacheConfig
    ,cuGetErrorName
    ,cuGetErrorString
    ,cuGraphicsMapResources
    ,cuGraphicsResourceGetMappedPointer_v2
    ,cuGraphicsResourceGetMappedMipmappedArray
    ,cuGraphicsSubResourceGetMappedArray
    ,cuGraphicsUnmapResources
    ,cuGraphicsUnregisterResource
    ,cuInit
    ,cuLaunchKernel
    ,cuMemAlloc_v2
    ,cuMemcpyDtoD_v2
    ,cuMemcpyDtoH_v2
    ,cuMemcpyHtoD_v2
    ,cuMemcpyDtoDAsync_v2
    ,cuMemcpyDtoHAsync_v2
    ,cuMemcpyHtoDAsync_v2
    ,cuMemGetAddressRange_v2
    ,cuMemFree_v2
    ,cuMemFreeHost
    ,cuMemGetInfo_v2
    ,cuMemHostAlloc
    ,cuMemHostRegister_v2
    ,cuMemHostUnregister
    ,cuMemsetD32_v2
    ,cuMemsetD32Async
    ,cuMemsetD8_v2
    ,cuMemsetD8Async
    ,cuModuleGetFunction
    ,cuModuleGetGlobal_v2
    ,cuModuleLoadDataEx
    ,cuModuleLoadFatBinary
    ,cuModuleUnload
    ,cuOccupancyMaxActiveBlocksPerMultiprocessor
    ,cuPointerGetAttribute
    ,cuStreamAddCallback
    ,cuStreamCreate
    ,cuStreamDestroy_v2
    ,cuStreamQuery
    ,cuStreamSynchronize
    ,cuStreamWaitEvent
    ,cuSurfObjectCreate
    ,cuSurfObjectDestroy
    ,cuTexObjectCreate
    ,cuTexObjectDestroy
    ,cuImportExternalMemory
    ,cuDestroyExternalMemory
    ,cuExternalMemoryGetMappedBuffer
    ,cuMemUnmap
    ,cuMemAddressFree
    ,cuMemGetAllocationGranularity
    ,cuMemAddressReserve
    ,cuMemCreate
    ,cuMemExportToShareableHandle
    ,cuMemMap
    ,cuMemRelease
    ,cuMemSetAccess
    ,cuMemImportFromShareableHandle
    ,cuLaunchHostFunc
    ,cuDestroyExternalSemaphore
    ,cuImportExternalSemaphore
    ,cuSignalExternalSemaphoresAsync
    ,cuWaitExternalSemaphoresAsync
    ,cuLogsRegisterCallback
    );
    const CUDA& getCUDAFunctionTable() const {return m_cuda;}
    NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader,
    nvrtcGetErrorString,
    nvrtcVersion,
    nvrtcAddNameExpression,
    nvrtcCompileProgram,
    nvrtcCreateProgram,
    nvrtcDestroyProgram,
    nvrtcGetLoweredName,
    nvrtcGetPTX,
    nvrtcGetPTXSize,
    nvrtcGetProgramLog,
    nvrtcGetProgramLogSize
    );
    const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;}
  • CCUDAHandler.h exposes SCUDADeviceInfo with CUdevice, CUuuid, and int attributes[CU_DEVICE_ATTRIBUTE_MAX]:
    struct SCUDADeviceInfo
    {
    CUdevice handle = {};
    CUuuid uuid = {};
    int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
    };
    inline core::vector<SCUDADeviceInfo> const& getAvailableDevices() const
    {
    return m_availableDevices;
  • CCUDAHandler.h stores CUDA m_cuda, NVRTC m_nvrtc, and core::vector<SCUDADeviceInfo> in the public class layout:
    // function tables
    CUDA m_cuda;
    NVRTC m_nvrtc;
    //
    core::vector<SCUDADeviceInfo> m_availableDevices;
    core::vector<core::smart_refctd_ptr<system::IFile>> m_headers;
    core::vector<const char*> m_headerContents;
    core::vector<std::string> m_headerNamesStorage;
    core::vector<const char*> m_headerNames;
    system::logger_opt_smart_ptr m_logger;
    int m_version;
  • CCUDADevice.h includes CUDA headers and exposes CUDA-native declarations and members such as CUmemAllocationHandleType, CUdevice, CUmemLocationType, CUdeviceptr, and CUcontext:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class CCUDAHandler;
    class NBL_API2 CCUDADevice : public core::IReferenceCounted
    {
    public:
    #ifdef _WIN32
    static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32;
    static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32;
    #else
    static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD;
    static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    #endif
    enum E_VIRTUAL_ARCHITECTURE
    {
    EVA_30,
    EVA_32,
    EVA_35,
    EVA_37,
    EVA_50,
    EVA_52,
    EVA_53,
    EVA_60,
    EVA_61,
    EVA_62,
    EVA_70,
    EVA_72,
    EVA_75,
    EVA_80,
    EVA_COUNT
    };
    static inline constexpr const char* virtualArchCompileOption[] = {
    "-arch=compute_30",
    "-arch=compute_32",
    "-arch=compute_35",
    "-arch=compute_37",
    "-arch=compute_50",
    "-arch=compute_52",
    "-arch=compute_53",
    "-arch=compute_60",
    "-arch=compute_61",
    "-arch=compute_62",
    "-arch=compute_70",
    "-arch=compute_72",
    "-arch=compute_75",
    "-arch=compute_80"
    };
    inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;}
    CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr<CCUDAHandler>&& handler);
    ~CCUDADevice();
    inline core::SRange<const char* const> geDefaultCompileOptions() const
    {
    return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()};
    }
    CUdevice getInternalObject() const { return m_handle; }
    const CCUDAHandler* getHandler() const { return m_handler.get(); }
    bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); }
    size_t roundToGranularity(CUmemLocationType location, size_t size) const;
    core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams);
    core::smart_refctd_ptr<CCUDAImportedMemory> importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem);
    core::smart_refctd_ptr<CCUDAImportedSemaphore> importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sem);
    private:
    CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const;
    static constexpr auto CudaMemoryLocationCount = 5;
    const system::logger_opt_ptr m_logger;
    std::vector<const char*> m_defaultCompileOptions;
    core::smart_refctd_ptr<CVulkanConnection> m_vulkanConnection;
    IPhysicalDevice* const m_physicalDevice;
    E_VIRTUAL_ARCHITECTURE m_virtualArchitecture;
    core::smart_refctd_ptr<CCUDAHandler> m_handler;
    CUdevice m_handle;
    CUcontext m_context;
    std::array<size_t, CudaMemoryLocationCount> m_allocationGranularity;
  • CCUDAExportableMemory.h exposes CUmemLocationType, CUdeviceptr, and cached creation params in the class layout:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class CCUDADevice;
    class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
    {
    public:
    struct SCreationParams
    {
    size_t size;
    uint32_t alignment;
    CUmemLocationType location;
    };
    struct SCachedCreationParams : SCreationParams
    {
    size_t granularSize;
    CUdeviceptr ptr;
    external_handle_t externalHandle;
    };
    CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params)
    : m_device(std::move(device))
    , m_params(std::move(params))
    {}
    ~CCUDAExportableMemory() override;
    CUdeviceptr getDeviceptr() const { return m_params.ptr; }
    const SCreationParams& getCreationParams() const { return m_params; }
    core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    SCachedCreationParams m_params;
  • CCUDAImportedMemory.h exposes CUexternalMemory, CUdeviceptr, and inline native handle access:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    #endif // _NBL_COMPILE_WITH_CUDA
    namespace nbl::video
    {
    class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
    {
    public:
    CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src,
    CUexternalMemory cuExtMem) :
    m_device(device),
    m_src(src),
    m_handle(cuExtMem) {}
    ~CCUDAImportedMemory() override;
    CUexternalMemory getInternalObject() const { return m_handle; }
    CUresult getMappedBuffer(CUdeviceptr* mappedBuffer);
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    core::smart_refctd_ptr<IDeviceMemoryAllocation> m_src;
    CUexternalMemory m_handle;
  • CCUDAImportedSemaphore.h exposes CUexternalSemaphore and inline native handle access:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted
    {
    public:
    CUexternalSemaphore getInternalObject() const { return m_handle; }
    CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device,
    core::smart_refctd_ptr<ISemaphore> src,
    CUexternalSemaphore semaphore)
    : m_device(std::move(device))
    , m_src(std::move(src))
    , m_handle(semaphore)
    {}
    ~CCUDAImportedSemaphore() override;
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    core::smart_refctd_ptr<ISemaphore> m_src;
    CUexternalSemaphore m_handle;
  • CCUDAHandler.cpp fills m_availableDevices and writes attributes using CU_DEVICE_ATTRIBUTE_MAX from the SDK that built the binary:
    CCUDAHandler::CCUDAHandler(
    CUDA&& _cuda,
    NVRTC&& _nvrtc,
    core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
    core::smart_refctd_ptr<system::ILogger>&& _logger,
    int _version)
    : m_cuda(std::move(_cuda))
    , m_nvrtc(std::move(_nvrtc))
    , m_headers(std::move(_headers))
    , m_logger(std::move(_logger))
    , m_version(_version)
    {
    for (auto& header : m_headers)
    {
    m_headerContents.push_back(reinterpret_cast<const char*>(header->getMappedPointer()));
    m_headerNamesStorage.push_back(header->getFileName().string());
    m_headerNames.push_back(m_headerNamesStorage.back().c_str());
    }
    int deviceCount = 0;
    if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0)
    return;
    for (int device_i = 0; device_i < deviceCount; device_i++)
    {
    CUdevice handle = -1;
    if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0)
    continue;
    CUuuid uuid = {};
    if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS)
    continue;
    m_availableDevices.emplace_back(handle, uuid);
    int* attributes = m_availableDevices.back().attributes;
    for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++)
    m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast<CUdevice_attribute>(i), handle);
    }
    }
  • CCUDAHandler.cpp later reads those cached attributes through the same SCUDADeviceInfo layout:
    core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice)
    {
    if (!vulkanConnection)
    return nullptr;
    const auto devices = vulkanConnection->getPhysicalDevices();
    if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end())
    return nullptr;
    for (const auto& device : m_availableDevices)
    {
    if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE))
    {
    CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT;
    const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR];
    const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR];

If we move this shape into Nabla::ext::CUDAInterop, the code no longer leaks through Nabla::Nabla, but these CUDA-native headers become the public ABI and compile-time dependency of the extension target.

A consumer then compiles those public extension headers again with its own CUDA SDK.

Inside our current Nabla examples build this may look fine. The examples are configured in the same build interface as Nabla and the CUDA interop target, so they naturally see the same CUDA::toolkit that built the extension.

But that is one project setup, not a general CMake guarantee. Another project can consume Nabla through add_subdirectory, a nested build, an external build, custom toolchain files, cache overrides, or manually provided CUDA targets and make the consumer side see a different SDK.

With an installed package this becomes normal downstream usage. Our package config does the same kind of CUDA discovery on the consumer side. It accepts Nabla_CUDA_TOOLKIT_ROOT, calls find_dependency(CUDAToolkit 13.0 REQUIRED), and links Nabla::ext::CUDAInterop to the consumer-resolved CUDA::toolkit:

if(_NBL_NABLA_LOAD_CUDA_INTEROP)
include(CMakeFindDependencyMacro)
if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "")
set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}")
endif()
find_dependency(CUDAToolkit 13.0 REQUIRED)
_nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")

So the extension binary may have been built with CUDA SDK A, while the downstream project resolves CUDA SDK B through its own package configure. We should not force that downstream SDK to be byte-for-byte the same SDK installed on the host that built the package. If the public extension headers contain SDK-defined layout, the downstream translation units compile that layout from SDK B while the already-built extension binary was compiled with SDK A.

Actual Risk

Raw CUDA handles like CUdevice, CUcontext, CUdeviceptr, CUexternalMemory, and CUexternalSemaphore are not the main ABI problem by themselves. They are handle-like values.

The risky part is exposing SDK-defined layout and inline access through the public extension headers.

  • int attributes[CU_DEVICE_ATTRIBUTE_MAX] makes SCUDADeviceInfo size depend on the CUDA SDK version.
  • CUDA m_cuda and NVRTC m_nvrtc make the class layout depend on generated function table members.
  • inline getters return references into those public-layout objects, so the consumer compiles member offsets and vector element stride from its own CUDA headers.

Consequences

  1. The extension ABI depends on the CUDA SDK header layout.

I decided to check this directly and installed multiple CUDA SDK versions locally. CUuuid is stable in the checked SDKs.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:286
    typedef struct CUuuid_st {
    char bytes[16]

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:303
    typedef struct CUuuid_st {
    char bytes[16]

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:324
    typedef struct CUuuid_st {
    char bytes[16]

CUdevice is also a handle-like int in the checked SDKs.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:258
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:259
    typedef CUdevice_v1 CUdevice

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:266
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:267
    typedef CUdevice_v1 CUdevice

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:287
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:288
    typedef CUdevice_v1 CUdevice

But CU_DEVICE_ATTRIBUTE_MAX is not stable.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:755
    CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:756
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 133

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:841
    CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:842
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 136

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:965
    CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:966
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 149

The point is not that we want to support CUDA 12.x for this feature. The point is that NVIDIA already changed this value across minor releases inside the same major line. CUDA 12.1 has 133 and CUDA 12.5 has 136. Requiring CUDA 13.x does not show that CUDA 13.0, 13.1, 13.2, or a later 13.x SDK will keep every public-layout macro and generated function table shape identical.

So the same public struct has different sizes.

Built with CUDA 12.1

SCUDADeviceInfo
    CUdevice handle                      4 bytes
    CUuuid uuid                          16 bytes
    int attributes[133]                  532 bytes
Total                                    552 bytes with 4-byte alignment

Consumed with CUDA 13.2

SCUDADeviceInfo
    CUdevice handle                      4 bytes
    CUuuid uuid                          16 bytes
    int attributes[149]                  596 bytes
Total                                    616 bytes with 4-byte alignment

If the extension binary fills core::vector<SCUDADeviceInfo> using 552-byte elements and the consumer iterates it using 616-byte elements, the consumer walks the vector with the wrong stride. This is an ABI bug even when the extension is a DLL.

  2. DLL does not remove the public-header ABI problem.

A DLL moves implementation into another binary, but the consumer still compiles inline methods, templates, member offsets, and sizeof(SCUDADeviceInfo) from the public header.

const auto& devices = handler->getAvailableDevices()

consumer-side codegen uses sizeof(SCUDADeviceInfo) from the consumer SDK
for each device in devices
    useUuid(device.uuid)

The DLL cannot fix the fact that the consumer and the DLL disagree on the vector element type layout. A static library has the same issue unless it is rebuilt from source with the exact same CUDA SDK as the consumer.

  3. Public loader tables are also ABI surface.

In the original header, CUDA m_cuda and NVRTC m_nvrtc were members of CCUDAHandler.

Nabla's loader macro expands every listed function into a typed member.

include/nbl/system/DynamicLibraryFunctionPointer.h:88

#define NBL_SYSTEM_DECLARE_DYNLIB_FUNCPTR(FUNC_NAME) \
    nbl::system::DynamicLibraryFunctionPointer<decltype(FUNC_NAME),NBL_CORE_UNIQUE_STRING_LITERAL_TYPE(#FUNC_NAME)> p ## FUNC_NAME

include/nbl/system/DynamicFunctionCaller.h:69

NBL_FOREACH(NBL_SYSTEM_DECLARE_DYNLIB_FUNCPTR,__VA_ARGS__)

That means the function pointer type is taken directly from the CUDA/NVRTC header used by the current translation unit.

Local CUDA headers show version-sensitive API surface.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h
    cuCtxCreate_v4 not present
    cuLogsRegisterCallback not present
    cuDeviceGetUuid_v2 present

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h
    cuCtxCreate_v4 present
    cuLogsRegisterCallback not present
    cuDeviceGetUuid_v2 present

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h
    #define cuCtxCreate cuCtxCreate_v4
    cuLogsRegisterCallback present at line 26475
    #define cuDeviceGetUuid cuDeviceGetUuid_v2

If the generated loader tables are public members of CCUDAHandler, then their member list and decltype(...) signatures become part of the public class layout of the extension binary.

  4. Every CCUDA*.h user gets a compile-time SDK dependency.
app.cpp
  -> nbl/ext/CUDAInterop/CCUDAHandler.h
    -> cuda.h
    -> nvrtc.h
    -> CUDA SDK headers

Updating the CUDA SDK path, version, or package can invalidate every translation unit that includes those headers. That is true even when the consumer only wants CCUDAHandler, CCUDADevice, or a helper inline method and does not directly call CUDA.

  5. The compatibility statement becomes weaker.

With CUDA SDK layout in public extension classes, the true statement is this.

The consumer can use a different CUDA SDK only if its headers remain ABI-compatible with the SDK used to build `Nabla::ext::CUDAInterop`.

It is not this.

The consumer can freely use any newer CUDA SDK with an already-built extension binary.

Build-tree examples do not show the stronger statement. They show our current single-configure examples setup where Nabla, the extension, and the examples share the same CUDA SDK discovery. Other build-interface setups, add_subdirectory consumers, installed packages, and downstream overrides are the cases where the boundary matters.

Accessor Model

The public Nabla CUDA object layout stays SDK-free.

class CCUDAHandler
{
public:
    struct SCUDADeviceInfo
    {
        std::array<uint8_t,16> uuid = {}
    }

    const core::vector<SCUDADeviceInfo>& getAvailableDevices() const

private:
    struct SNativeState
    std::unique_ptr<SNativeState> m_native
    core::vector<SCUDADeviceInfo> m_availableDevices
}

CUDA-dependent layout stays private to the binary.

struct CCUDAHandler::SNativeState
{
    CUDA cuda
    NVRTC nvrtc
    core::vector<cuda_native::SCUDADeviceInfo> availableDevices
}

Native CUDA access is still explicit opt-in.

// Explicit opt-in: only this header pulls in CUDA SDK types.
#include <nbl/ext/CUDAInterop/CUDAInteropNative.h>

const CUDA& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler);
const auto& nativeDevices = cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler);

So the plugin as Nabla extension model is workable. If it exposes the original CUDA-heavy CCUDA*.h layout publicly, we choose raw CUDA ergonomics over ABI and rebuild isolation.

The accessor/native-state approach is not a CUDA wrapper. It is a small glue layer between Nabla objects and the CUDA world, only for the interop objects that cross that boundary. Raw CUDA is still available through the explicit opt-in header.

TLDR

Yes, we can move all CUDA interop code into Nabla::ext::CUDAInterop and expose CUDA directly from the extension headers. That gives nicer raw CUDA ergonomics and avoids a few accessor calls.

Then we explicitly accept that Nabla::ext::CUDAInterop has CUDA SDK-dependent public ABI. To be safe, consumers should build against the same CUDA SDK that was used to build the extension binary, or rebuild the extension with their SDK. Otherwise layout mismatches like CU_DEVICE_ATTRIBUTE_MAX can produce real ABI bugs. Even if this is acceptable for this plugin, in my opinion it is limiting, and this is why I introduced the accessor/native-state model at the cost of slightly worse raw CUDA ergonomics.

This can pass cleanly in our current examples build because Nabla, the extension, and examples are configured together. That does not cover every build-interface setup, add_subdirectory consumer, or the package/install interface where downstream CMake may find or override a different CUDA SDK. Then the public header layout can disagree with the prebuilt extension binary.

The tradeoff is:

  • plugin with public CUDA layout means fewer glue lines and nicer direct CUDA API, but SDK becomes a public compile-time dependency of the extension and ABI compatibility depends on CUDA header compatibility.
  • accessor/native-state model means a few extra explicit calls, but stable SDK-free public Nabla layout, smaller rebuild surface, and consumer-side CUDA SDK version can move independently for native opt-in code.

We can move it into a plugin as Nabla extension if that is the preferred direction. My subjective take is that I do not like that model very much, but we can do it as long as we are explicit that this accepts the SDK-dependent public ABI and the potential mismatch, rebuild, and package issues described above.

@devshgraphicsprogramming
Copy link
Copy Markdown
Member

Excellent report, well presented.

I can see the problem now.

One thing to note: newer CUDA SDK versions are not allowed to change handle definitions like CUdeviceptr and the external semaphore types, otherwise an app compiled against a lower-version SDK wouldn't run with a newer runtime.

That means the function pointer type is taken directly from the CUDA/NVRTC header used by the current translation unit.

Same thing for function signatures like the cuCtx* functions; this is why they have v2 and even v4 variants. They cannot change a signature because the CUDA Driver (not Runtime) API is dynamically linked and you must be able to run after a later driver update.

Local CUDA headers show version-sensitive API surface.

What you've uncovered here is the reason why we require the SDK version to be >=13.0 because one of the function pointers in the table can only be loaded from that SDK.

Although I don't intend to provide a guarantee that every function pointer will be loaded and available, it's meant to be a quickly hacked-together, low-effort Volk, but for CUDA — except with no global C functions or state.

The CU_DEVICE_ATTRIBUTE_MAX problem is interesting indeed, and highlights why your line of reasoning is correct.

I think there's a middle ground compromise we can come to

@devshgraphicsprogramming
Copy link
Copy Markdown
Member

Ok, let's split into two targets with optional leaking headers.

But instead of accessors lets abuse implicit conversions and constructors
https://godbolt.org/z/WsGqcc1f4

// SDK-free stand-in for CUdeviceptr: 8 bytes, 8-byte aligned, so it can
// alias the real handle across the leak boundary below (size/alignment are
// static_assert-checked by Wrapped).
struct alignas(8) OpaqueCUdeviceptr
{
    uint8_t value[8];
};

//===================================LEAK BOUNDARY========================================
// CUDA SDK DEP now
//#include CUDA
// Stand-in for the real SDK typedef; in the leaking header this would come from cuda.h.
using CUdeviceptr = void*;

// trait: maps an opaque handle type to the native CUDA type it mirrors.
// Primary template is left undefined so unmapped types fail at compile time.
template<typename T>
struct opaque_cuda_type;

template<>
struct opaque_cuda_type<OpaqueCUdeviceptr>
{
    using type = CUdeviceptr;
};

// Bridge type that lives on the leak boundary: implicitly converts to both the
// opaque SDK-free handle and the native CUDA type, so call sites need no casts.
template<typename Opaque>
struct Wrapped
{
    using cuda_t = typename opaque_cuda_type<Opaque>::type;
    // Both sides of the reinterpret_cast below must be trivially copyable and
    // layout-identical, otherwise aliasing one as the other is not sound.
    static_assert(std::is_trivial_v<cuda_t>);
    static_assert(std::is_trivially_copyable_v<Opaque>);
    static_assert(sizeof(Opaque)==sizeof(cuda_t));
    static_assert(alignof(Opaque)==alignof(cuda_t));

    // constructors
    Wrapped() = default;
    Wrapped(const Wrapped&) = default;
    Wrapped(const cuda_t& val) {operator=(val);}
    Wrapped(const Opaque& val) {operator=(val);}

    // assignment
    inline Wrapped& operator=(const Wrapped&) = default;
    inline Wrapped& operator=(const cuda_t& val) {value = val; return *this;}
    // routes through the Opaque conversion so the byte copy happens in one place
    inline Wrapped& operator=(const Opaque& val) {operator Opaque&() = val; return *this;}

    // implicit conversion to the native CUDA value reference
    inline operator cuda_t&() {return value;}
    inline operator const cuda_t&() const {return value;}

    // implicit conversion to the opaque handle reference
    inline operator Opaque&() {return reinterpret_cast<Opaque&>(value);}
    inline operator const Opaque&() const {return reinterpret_cast<const Opaque&>(value);}

    cuda_t value = {};
};

// Type your code here, or load an example.
CUdeviceptr pCudaCall(CUdeviceptr);

// Demonstrates both directions of the implicit-conversion boundary:
// opaque handle -> wrapper -> native CUDA call -> wrapper -> opaque handle.
CUdeviceptr square(int num)
{
    OpaqueCUdeviceptr opaqueHandle = {};

    // construct from the opaque side, then re-assign to exercise operator=
    Wrapped<OpaqueCUdeviceptr> wrapped{opaqueHandle};
    wrapped = opaqueHandle;

    // the wrapper converts implicitly at the native call site and back
    wrapped = pCudaCall(wrapped);
    opaqueHandle = wrapped;

    return wrapped;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants