Skip to content

Move CUDA interop behind native opt-in#1067

Open
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS
Open

Move CUDA interop behind native opt-in#1067
AnastaZIuk wants to merge 27 commits into vk_cuda_interop from
cuInteropBS

Conversation

@AnastaZIuk
Copy link
Copy Markdown
Member

Moves CUDA interop behind SDK-free Nabla headers with explicit Nabla::ext::CUDAInterop native opt-in. Keeps raw CUDA/NVRTC access available for consumers that ask for native opt-in while avoiding default public SDK requirements.

Comment on lines +163 to +210
// Opt-in native CUDA API. The declarations below are implemented by the Nabla library.
// This header is intentionally the only public path that includes CUDA SDK types.
class NBL_API2 CCUDAHandlerAccessor
{
public:
static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler);
static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler);
static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger);
static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result);
static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result);
static const core::vector<SCUDADeviceInfo>& getAvailableDevices(const CCUDAHandler& handler);
static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr);
static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange<const char* const> options);
static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log);
static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog);
static SPTXResult compileDirectlyToPTX(
CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange<const char* const> nvrtcOptions,
std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr
);
};

class NBL_API2 CCUDADeviceAccessor
{
public:
static CUdevice getInternalObject(const CCUDADevice& device);
static CUcontext getContext(const CCUDADevice& device);
static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size);
static core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params);
};

class NBL_API2 CCUDAExportableMemoryAccessor
{
public:
static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory);
};

class NBL_API2 CCUDAImportedMemoryAccessor
{
public:
static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory);
static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer);
};

class NBL_API2 CCUDAImportedSemaphoreAccessor
{
public:
static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore);
};
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accessors make no sense just move all the nbl/video/CCUDA*.h to the extension

Comment on lines -298 to -305
#define ASSERT_CUDA_SUCCESS(expr, handler) \
do { \
const auto cudaResult = (expr); \
if (!((handler)->defaultHandleResult(cudaResult))) { \
assert(false); \
} \
} while(0)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that macro was useful just needs a rename

Comment on lines -31 to -52
inline bool CloseExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
return CloseHandle(handle);
#else
return (close(handle) == 0);
#endif
}

inline external_handle_t DuplicateExternalHandle(external_handle_t handle)
{
#ifdef _WIN32
HANDLE re = ExternalHandleNull;

const HANDLE cur = GetCurrentProcess();
if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS))
return ExternalHandleNull;

return re;
#else
return dup(handle);
#endif
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you may want to keep that inline, these are OS calls, and when they're inline they'll work BEFORE Nabla.dll is delay loaded, which is useful

Comment on lines +1 to +29
#include "nbl/video/CUDAInterop.h"
#include "nbl/system/IApplicationFramework.h"

#include <type_traits>

#ifdef _NBL_COMPILE_WITH_CUDA_
#error "Nabla::Nabla must not propagate the CUDA build define."
#endif

#ifdef CUDA_VERSION
#error "Nabla::Nabla must not require CUDA SDK headers."
#endif

namespace
{

class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework
{
using base_t = nbl::system::IApplicationFramework;

public:
using base_t::base_t;

bool onAppInitialized(nbl::core::smart_refctd_ptr<nbl::system::ISystem>&&) override
{
static_assert(std::is_class_v<nbl::video::CCUDADevice>);
static_assert(std::is_class_v<nbl::video::CCUDAExportableMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedMemory>);
static_assert(std::is_class_v<nbl::video::CCUDAImportedSemaphore>);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it would make more sense to not have anything CUDA related in Nabla itself

const auto& granularity = SAccess::native(device).allocationGranularity[location];
return ((size - 1) / granularity + 1) * granularity;
}

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mentioned in the original PR, this should be inline

Comment on lines +104 to +119
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
return err;
}

CUmemAccessDesc accessDesc = {
.location = { .type = location, .id = m_handle },
.location = { .type = location, .id = native.handle },
.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE,
};

if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err)
{
ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler);
ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)))
assert(false);
if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)))
assert(false);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevyuu thinking of it we shouldn't crash an entire program because of failure here :s

@AnastaZIuk
Copy link
Copy Markdown
Member Author

CUDA interop extension ABI

As discussed on Discord, the proposal is to move all CUDA interop code into a plugin as Nabla extension and expose CUDA directly from that target's public headers.

We can do that. It protects Nabla::Nabla from the CUDA SDK leak. It does not remove the boundary problem though. It moves that problem to Nabla::ext::CUDAInterop, and then the extension target owns the ABI and versioning consequences.

The goal is not to wrap CUDA. The goal is to keep CUDA SDK-defined layout out of the public class layout.

Proposed Plugin Shape

Original PR: #1061

Let's say we move the original PR shape into the extension target. In other words, the CUDA interop code no longer lives in Nabla::Nabla, but the public CUDA-native headers stay effectively the same and move from include/nbl/video/CCUDA*.h to include/nbl/ext/CUDAInterop/CCUDA*.h.

The exact code shape from the last commit currently on origin/vk_cuda_interop is here:

  • CCUDAHandler.h exposes the CUDA and NVRTC loader table types plus inline getters:
    NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader
    ,cuCtxCreate_v4
    ,cuDevicePrimaryCtxRetain
    ,cuDevicePrimaryCtxRelease
    ,cuDevicePrimaryCtxSetFlags
    ,cuDevicePrimaryCtxGetState
    ,cuCtxDestroy_v2
    ,cuCtxEnablePeerAccess
    ,cuCtxGetApiVersion
    ,cuCtxGetCurrent
    ,cuCtxGetDevice
    ,cuCtxGetSharedMemConfig
    ,cuCtxPopCurrent_v2
    ,cuCtxPushCurrent_v2
    ,cuCtxSetCacheConfig
    ,cuCtxSetCurrent
    ,cuCtxSetSharedMemConfig
    ,cuCtxSynchronize
    ,cuDeviceComputeCapability
    ,cuDeviceCanAccessPeer
    ,cuDeviceGetCount
    ,cuDeviceGet
    ,cuDeviceGetAttribute
    ,cuDeviceGetLuid
    ,cuDeviceGetUuid_v2
    ,cuDeviceTotalMem_v2
    ,cuDeviceGetName
    ,cuDriverGetVersion
    ,cuEventCreate
    ,cuEventDestroy_v2
    ,cuEventElapsedTime
    ,cuEventQuery
    ,cuEventRecord
    ,cuEventSynchronize
    ,cuFuncGetAttribute
    ,cuFuncSetCacheConfig
    ,cuGetErrorName
    ,cuGetErrorString
    ,cuGraphicsMapResources
    ,cuGraphicsResourceGetMappedPointer_v2
    ,cuGraphicsResourceGetMappedMipmappedArray
    ,cuGraphicsSubResourceGetMappedArray
    ,cuGraphicsUnmapResources
    ,cuGraphicsUnregisterResource
    ,cuInit
    ,cuLaunchKernel
    ,cuMemAlloc_v2
    ,cuMemcpyDtoD_v2
    ,cuMemcpyDtoH_v2
    ,cuMemcpyHtoD_v2
    ,cuMemcpyDtoDAsync_v2
    ,cuMemcpyDtoHAsync_v2
    ,cuMemcpyHtoDAsync_v2
    ,cuMemGetAddressRange_v2
    ,cuMemFree_v2
    ,cuMemFreeHost
    ,cuMemGetInfo_v2
    ,cuMemHostAlloc
    ,cuMemHostRegister_v2
    ,cuMemHostUnregister
    ,cuMemsetD32_v2
    ,cuMemsetD32Async
    ,cuMemsetD8_v2
    ,cuMemsetD8Async
    ,cuModuleGetFunction
    ,cuModuleGetGlobal_v2
    ,cuModuleLoadDataEx
    ,cuModuleLoadFatBinary
    ,cuModuleUnload
    ,cuOccupancyMaxActiveBlocksPerMultiprocessor
    ,cuPointerGetAttribute
    ,cuStreamAddCallback
    ,cuStreamCreate
    ,cuStreamDestroy_v2
    ,cuStreamQuery
    ,cuStreamSynchronize
    ,cuStreamWaitEvent
    ,cuSurfObjectCreate
    ,cuSurfObjectDestroy
    ,cuTexObjectCreate
    ,cuTexObjectDestroy
    ,cuImportExternalMemory
    ,cuDestroyExternalMemory
    ,cuExternalMemoryGetMappedBuffer
    ,cuMemUnmap
    ,cuMemAddressFree
    ,cuMemGetAllocationGranularity
    ,cuMemAddressReserve
    ,cuMemCreate
    ,cuMemExportToShareableHandle
    ,cuMemMap
    ,cuMemRelease
    ,cuMemSetAccess
    ,cuMemImportFromShareableHandle
    ,cuLaunchHostFunc
    ,cuDestroyExternalSemaphore
    ,cuImportExternalSemaphore
    ,cuSignalExternalSemaphoresAsync
    ,cuWaitExternalSemaphoresAsync
    ,cuLogsRegisterCallback
    );
    const CUDA& getCUDAFunctionTable() const {return m_cuda;}
    NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader,
    nvrtcGetErrorString,
    nvrtcVersion,
    nvrtcAddNameExpression,
    nvrtcCompileProgram,
    nvrtcCreateProgram,
    nvrtcDestroyProgram,
    nvrtcGetLoweredName,
    nvrtcGetPTX,
    nvrtcGetPTXSize,
    nvrtcGetProgramLog,
    nvrtcGetProgramLogSize
    );
    const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;}
  • CCUDAHandler.h exposes SCUDADeviceInfo with CUdevice, CUuuid, and int attributes[CU_DEVICE_ATTRIBUTE_MAX]:
    struct SCUDADeviceInfo
    {
    CUdevice handle = {};
    CUuuid uuid = {};
    int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {};
    };
    inline core::vector<SCUDADeviceInfo> const& getAvailableDevices() const
    {
    return m_availableDevices;
  • CCUDAHandler.h stores CUDA m_cuda, NVRTC m_nvrtc, and core::vector<SCUDADeviceInfo> in the public class layout:
    // function tables
    CUDA m_cuda;
    NVRTC m_nvrtc;
    //
    core::vector<SCUDADeviceInfo> m_availableDevices;
    core::vector<core::smart_refctd_ptr<system::IFile>> m_headers;
    core::vector<const char*> m_headerContents;
    core::vector<std::string> m_headerNamesStorage;
    core::vector<const char*> m_headerNames;
    system::logger_opt_smart_ptr m_logger;
    int m_version;
  • CCUDADevice.h includes CUDA headers and exposes CUDA-native declarations and members such as CUmemAllocationHandleType, CUdevice, CUmemLocationType, CUdeviceptr, and CUcontext:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class CCUDAHandler;
    class NBL_API2 CCUDADevice : public core::IReferenceCounted
    {
    public:
    #ifdef _WIN32
    static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32;
    static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32;
    #else
    static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD;
    static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    #endif
    enum E_VIRTUAL_ARCHITECTURE
    {
    EVA_30,
    EVA_32,
    EVA_35,
    EVA_37,
    EVA_50,
    EVA_52,
    EVA_53,
    EVA_60,
    EVA_61,
    EVA_62,
    EVA_70,
    EVA_72,
    EVA_75,
    EVA_80,
    EVA_COUNT
    };
    static inline constexpr const char* virtualArchCompileOption[] = {
    "-arch=compute_30",
    "-arch=compute_32",
    "-arch=compute_35",
    "-arch=compute_37",
    "-arch=compute_50",
    "-arch=compute_52",
    "-arch=compute_53",
    "-arch=compute_60",
    "-arch=compute_61",
    "-arch=compute_62",
    "-arch=compute_70",
    "-arch=compute_72",
    "-arch=compute_75",
    "-arch=compute_80"
    };
    inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;}
    CCUDADevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr<CCUDAHandler>&& handler);
    ~CCUDADevice();
    inline core::SRange<const char* const> geDefaultCompileOptions() const
    {
    return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()};
    }
    CUdevice getInternalObject() const { return m_handle; }
    const CCUDAHandler* getHandler() const { return m_handler.get(); }
    bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); }
    size_t roundToGranularity(CUmemLocationType location, size_t size) const;
    core::smart_refctd_ptr<CCUDAExportableMemory> createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams);
    core::smart_refctd_ptr<CCUDAImportedMemory> importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>&& mem);
    core::smart_refctd_ptr<CCUDAImportedSemaphore> importExternalSemaphore(core::smart_refctd_ptr<ISemaphore>&& sem);
    private:
    CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const;
    static constexpr auto CudaMemoryLocationCount = 5;
    const system::logger_opt_ptr m_logger;
    std::vector<const char*> m_defaultCompileOptions;
    core::smart_refctd_ptr<CVulkanConnection> m_vulkanConnection;
    IPhysicalDevice* const m_physicalDevice;
    E_VIRTUAL_ARCHITECTURE m_virtualArchitecture;
    core::smart_refctd_ptr<CCUDAHandler> m_handler;
    CUdevice m_handle;
    CUcontext m_context;
    std::array<size_t, CudaMemoryLocationCount> m_allocationGranularity;
  • CCUDAExportableMemory.h exposes CUmemLocationType, CUdeviceptr, and cached creation params in the class layout:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class CCUDADevice;
    class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted
    {
    public:
    struct SCreationParams
    {
    size_t size;
    uint32_t alignment;
    CUmemLocationType location;
    };
    struct SCachedCreationParams : SCreationParams
    {
    size_t granularSize;
    CUdeviceptr ptr;
    external_handle_t externalHandle;
    };
    CCUDAExportableMemory(core::smart_refctd_ptr<CCUDADevice> device, SCachedCreationParams&& params)
    : m_device(std::move(device))
    , m_params(std::move(params))
    {}
    ~CCUDAExportableMemory() override;
    CUdeviceptr getDeviceptr() const { return m_params.ptr; }
    const SCreationParams& getCreationParams() const { return m_params; }
    core::smart_refctd_ptr<IDeviceMemoryAllocation> exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const;
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    SCachedCreationParams m_params;
  • CCUDAImportedMemory.h exposes CUexternalMemory, CUdeviceptr, and inline native handle access:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    #endif // _NBL_COMPILE_WITH_CUDA
    namespace nbl::video
    {
    class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted
    {
    public:
    CCUDAImportedMemory(core::smart_refctd_ptr<CCUDADevice> device, core::smart_refctd_ptr<nbl::video::IDeviceMemoryAllocation> src,
    CUexternalMemory cuExtMem) :
    m_device(device),
    m_src(src),
    m_handle(cuExtMem) {}
    ~CCUDAImportedMemory() override;
    CUexternalMemory getInternalObject() const { return m_handle; }
    CUresult getMappedBuffer(CUdeviceptr* mappedBuffer);
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    core::smart_refctd_ptr<IDeviceMemoryAllocation> m_src;
    CUexternalMemory m_handle;
  • CCUDAImportedSemaphore.h exposes CUexternalSemaphore and inline native handle access:
    #ifdef _NBL_COMPILE_WITH_CUDA_
    #include "cuda.h"
    #include "nvrtc.h"
    #if CUDA_VERSION < 9000
    #error "Need CUDA 9.0 SDK or higher."
    #endif
    // useful includes in the future
    //#include "cudaEGL.h"
    //#include "cudaVDPAU.h"
    namespace nbl::video
    {
    class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted
    {
    public:
    CUexternalSemaphore getInternalObject() const { return m_handle; }
    CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device,
    core::smart_refctd_ptr<ISemaphore> src,
    CUexternalSemaphore semaphore)
    : m_device(std::move(device))
    , m_src(std::move(src))
    , m_handle(semaphore)
    {}
    ~CCUDAImportedSemaphore() override;
    private:
    core::smart_refctd_ptr<CCUDADevice> m_device;
    core::smart_refctd_ptr<ISemaphore> m_src;
    CUexternalSemaphore m_handle;
  • CCUDAHandler.cpp fills m_availableDevices and writes attributes using CU_DEVICE_ATTRIBUTE_MAX from the SDK that built the binary:
    CCUDAHandler::CCUDAHandler(
    CUDA&& _cuda,
    NVRTC&& _nvrtc,
    core::vector<core::smart_refctd_ptr<system::IFile>>&& _headers,
    core::smart_refctd_ptr<system::ILogger>&& _logger,
    int _version)
    : m_cuda(std::move(_cuda))
    , m_nvrtc(std::move(_nvrtc))
    , m_headers(std::move(_headers))
    , m_logger(std::move(_logger))
    , m_version(_version)
    {
    for (auto& header : m_headers)
    {
    m_headerContents.push_back(reinterpret_cast<const char*>(header->getMappedPointer()));
    m_headerNamesStorage.push_back(header->getFileName().string());
    m_headerNames.push_back(m_headerNamesStorage.back().c_str());
    }
    int deviceCount = 0;
    if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0)
    return;
    for (int device_i = 0; device_i < deviceCount; device_i++)
    {
    CUdevice handle = -1;
    if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0)
    continue;
    CUuuid uuid = {};
    if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS)
    continue;
    m_availableDevices.emplace_back(handle, uuid);
    int* attributes = m_availableDevices.back().attributes;
    for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++)
    m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast<CUdevice_attribute>(i), handle);
    }
    }
  • CCUDAHandler.cpp later reads those cached attributes through the same SCUDADeviceInfo layout:
    core::smart_refctd_ptr<CCUDADevice> CCUDAHandler::createDevice(core::smart_refctd_ptr<CVulkanConnection>&& vulkanConnection, IPhysicalDevice* physicalDevice)
    {
    if (!vulkanConnection)
    return nullptr;
    const auto devices = vulkanConnection->getPhysicalDevices();
    if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end())
    return nullptr;
    for (const auto& device : m_availableDevices)
    {
    if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE))
    {
    CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT;
    const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR];
    const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR];

If we move this shape into Nabla::ext::CUDAInterop, the code no longer leaks through Nabla::Nabla, but these CUDA-native headers become the public ABI and compile-time dependency of the extension target.

A consumer then compiles those public extension headers again with its own CUDA SDK.

Inside our current Nabla examples build this may look fine. The examples are configured in the same build interface as Nabla and the CUDA interop target, so they naturally see the same CUDA::toolkit that built the extension.

But that is one project setup, not a general CMake guarantee. Another project can consume Nabla through add_subdirectory, a nested build, an external build, custom toolchain files, cache overrides, or manually provided CUDA targets and make the consumer side see a different SDK.

With an installed package this becomes normal downstream usage. Our package config does the same kind of CUDA discovery on the consumer side. It accepts Nabla_CUDA_TOOLKIT_ROOT, calls find_dependency(CUDAToolkit 13.0 REQUIRED), and links Nabla::ext::CUDAInterop to the consumer-resolved CUDA::toolkit:

if(_NBL_NABLA_LOAD_CUDA_INTEROP)
include(CMakeFindDependencyMacro)
if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "")
set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}")
endif()
find_dependency(CUDAToolkit 13.0 REQUIRED)
_nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND)
if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop)
target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit)
if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake")

So the extension binary may have been built with CUDA SDK A, while the downstream project resolves CUDA SDK B through its own package configure. We should not force that downstream SDK to be byte-for-byte the same SDK installed on the host that built the package. If the public extension headers contain SDK-defined layout, the downstream translation units compile that layout from SDK B while the already-built extension binary was compiled with SDK A.

Actual Risk

Raw CUDA handles like CUdevice, CUcontext, CUdeviceptr, CUexternalMemory, and CUexternalSemaphore are not the main ABI problem by themselves. They are handle-like values.

The risky part is exposing SDK-defined layout and inline access through the public extension headers.

  • int attributes[CU_DEVICE_ATTRIBUTE_MAX] makes SCUDADeviceInfo size depend on the CUDA SDK version.
  • CUDA m_cuda and NVRTC m_nvrtc make the class layout depend on generated function table members.
  • inline getters return references into those public-layout objects, so the consumer compiles member offsets and vector element stride from its own CUDA headers.

Consequences

  1. The extension ABI depends on the CUDA SDK header layout.

I decided to check this directly and installed multiple CUDA SDK versions locally. CUuuid is stable in the checked SDKs.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:286
    typedef struct CUuuid_st {
    char bytes[16]

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:303
    typedef struct CUuuid_st {
    char bytes[16]

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:324
    typedef struct CUuuid_st {
    char bytes[16]

CUdevice is also a handle-like int in the checked SDKs.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:258
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:259
    typedef CUdevice_v1 CUdevice

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:266
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:267
    typedef CUdevice_v1 CUdevice

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:287
    typedef int CUdevice_v1
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:288
    typedef CUdevice_v1 CUdevice

But CU_DEVICE_ATTRIBUTE_MAX is not stable.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:755
    CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h:756
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 133

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:841
    CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h:842
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 136

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:965
    CU_DEVICE_ATTRIBUTE_ATOMIC_REDUCTION_SUPPORTED = 148
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h:966
    CU_DEVICE_ATTRIBUTE_MAX
    so CU_DEVICE_ATTRIBUTE_MAX == 149

The point is not that we want to support CUDA 12.x for this feature. The point is that NVIDIA already changed this value across minor releases inside the same major line. CUDA 12.1 has 133 and CUDA 12.5 has 136. Requiring CUDA 13.x does not show that CUDA 13.0, 13.1, 13.2, or a later 13.x SDK will keep every public-layout macro and generated function table shape identical.

So the same public struct has different sizes.

Built with CUDA 12.1

SCUDADeviceInfo
    CUdevice handle                      4 bytes
    CUuuid uuid                          16 bytes
    int attributes[133]                  532 bytes
Total                                    552 bytes with 4-byte alignment

Consumed with CUDA 13.2

SCUDADeviceInfo
    CUdevice handle                      4 bytes
    CUuuid uuid                          16 bytes
    int attributes[149]                  596 bytes
Total                                    616 bytes with 4-byte alignment

If the extension binary fills core::vector<SCUDADeviceInfo> using 552-byte elements and the consumer iterates it using 616-byte elements, the consumer walks the vector with the wrong stride. This is an ABI bug even when the extension is a DLL.

  2. DLL does not remove the public-header ABI problem.

A DLL moves implementation into another binary, but the consumer still compiles inline methods, templates, member offsets, and sizeof(SCUDADeviceInfo) from the public header.

const auto& devices = handler->getAvailableDevices()

consumer-side codegen uses sizeof(SCUDADeviceInfo) from the consumer SDK
for each device in devices
    useUuid(device.uuid)

The DLL cannot fix the fact that the consumer and the DLL disagree on the vector element type layout. A static library has the same issue unless it is rebuilt from source with the exact same CUDA SDK as the consumer.

  3. Public loader tables are also ABI surface.

In the original header, CUDA m_cuda and NVRTC m_nvrtc were members of CCUDAHandler.

Nabla's loader macro expands every listed function into a typed member.

include/nbl/system/DynamicLibraryFunctionPointer.h:88

#define NBL_SYSTEM_DECLARE_DYNLIB_FUNCPTR(FUNC_NAME) \
    nbl::system::DynamicLibraryFunctionPointer<decltype(FUNC_NAME),NBL_CORE_UNIQUE_STRING_LITERAL_TYPE(#FUNC_NAME)> p ## FUNC_NAME

include/nbl/system/DynamicFunctionCaller.h:69

NBL_FOREACH(NBL_SYSTEM_DECLARE_DYNLIB_FUNCPTR,__VA_ARGS__)

That means the function pointer type is taken directly from the CUDA/NVRTC header used by the current translation unit.

Local CUDA headers show version-sensitive API surface.

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include\cuda.h
    cuCtxCreate_v4 not present
    cuLogsRegisterCallback not present
    cuDeviceGetUuid_v2 present

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\include\cuda.h
    cuCtxCreate_v4 present
    cuLogsRegisterCallback not present
    cuDeviceGetUuid_v2 present

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2\include\cuda.h
    #define cuCtxCreate cuCtxCreate_v4
    cuLogsRegisterCallback present at line 26475
    #define cuDeviceGetUuid cuDeviceGetUuid_v2

If the generated loader tables are public members of CCUDAHandler, then their member list and decltype(...) signatures become part of the public class layout of the extension binary.

  4. Every CCUDA*.h user gets a compile-time SDK dependency.
app.cpp
  -> nbl/ext/CUDAInterop/CCUDAHandler.h
    -> cuda.h
    -> nvrtc.h
    -> CUDA SDK headers

Updating the CUDA SDK path, version, or package can invalidate every translation unit that includes those headers. That is true even when the consumer only wants CCUDAHandler, CCUDADevice, or a helper inline method and does not directly call CUDA.

  5. The compatibility statement becomes weaker.

With CUDA SDK layout in public extension classes, the true statement is this.

The consumer can use a different CUDA SDK only if its headers remain ABI-compatible with the SDK used to build `Nabla::ext::CUDAInterop`.

It is not this.

The consumer can freely use any newer CUDA SDK with an already-built extension binary.

Build-tree examples do not show the stronger statement. They show our current single-configure examples setup where Nabla, the extension, and the examples share the same CUDA SDK discovery. Other build-interface setups, add_subdirectory consumers, installed packages, and downstream overrides are the cases where the boundary matters.

Accessor Model

The public Nabla CUDA object layout stays SDK-free.

class CCUDAHandler
{
public:
    struct SCUDADeviceInfo
    {
        std::array<uint8_t,16> uuid = {}
    }

    const core::vector<SCUDADeviceInfo>& getAvailableDevices() const

private:
    struct SNativeState
    std::unique_ptr<SNativeState> m_native
    core::vector<SCUDADeviceInfo> m_availableDevices
}

CUDA-dependent layout stays private to the binary.

struct CCUDAHandler::SNativeState
{
    CUDA cuda
    NVRTC nvrtc
    core::vector<cuda_native::SCUDADeviceInfo> availableDevices
}

Native CUDA access is still explicit opt-in.

// Explicit opt-in: only this header pulls in CUDA SDK types.
#include <nbl/ext/CUDAInterop/CUDAInteropNative.h>

const CUDA& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler);
const auto& nativeDevices = cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler);

So the plugin as Nabla extension model is workable. If it exposes the original CUDA-heavy CCUDA*.h layout publicly, we choose raw CUDA ergonomics over ABI and rebuild isolation.

The accessor/native-state approach is not a CUDA wrapper. It is a small glue layer between Nabla objects and the CUDA world, only for the interop objects that cross that boundary. Raw CUDA is still available through the explicit opt-in header.

TLDR

Yes, we can move all CUDA interop code into Nabla::ext::CUDAInterop and expose CUDA directly from the extension headers. That gives nicer raw CUDA ergonomics and avoids a few accessor calls.

Then we explicitly accept that Nabla::ext::CUDAInterop has CUDA SDK-dependent public ABI. To be safe, consumers should build against the same CUDA SDK that was used to build the extension binary, or rebuild the extension with their SDK. Otherwise layout mismatches like CU_DEVICE_ATTRIBUTE_MAX can produce real ABI bugs. Even if this is acceptable for this plugin, in my opinion it is limiting, and this is why I introduced the accessor/native-state model at the cost of slightly worse raw CUDA ergonomics.

This can pass cleanly in our current examples build because Nabla, the extension, and examples are configured together. That does not cover every build-interface setup, add_subdirectory consumer, or the package/install interface where downstream CMake may find or override a different CUDA SDK. Then the public header layout can disagree with the prebuilt extension binary.

The tradeoff is:

  • plugin with public CUDA layout means fewer glue lines and nicer direct CUDA API, but SDK becomes a public compile-time dependency of the extension and ABI compatibility depends on CUDA header compatibility.
  • accessor/native-state model means a few extra explicit calls, but stable SDK-free public Nabla layout, smaller rebuild surface, and consumer-side CUDA SDK version can move independently for native opt-in code.

We can move it into a plugin as Nabla extension if that is the preferred direction. My subjective take is that I do not like that model very much, but we can do it as long as we are explicit that this accepts the SDK-dependent public ABI and the potential mismatch, rebuild, and package issues described above.

@devshgraphicsprogramming
Copy link
Copy Markdown
Member

Excellent report, well presented.

I can see the problem now.

One thing to note: newer CUDA SDK versions are not allowed to change handle definitions like CUdeviceptr and the external semaphore types, otherwise an app compiled against a lower-version SDK wouldn't run with a newer runtime.

That means the function pointer type is taken directly from the CUDA/NVRTC header used by the current translation unit.

Same thing for function signatures like the cuCtx* functions; this is why they have v2 and even v4 variants. They cannot change a signature because the CUDA Driver (not Runtime) API is dynamically linked and you must be able to run after a later driver update.

Local CUDA headers show version-sensitive API surface.

What you've uncovered here is the reason why we require the SDK version to be >=13.0 because one of the function pointers in the table can only be loaded from that SDK.

Although I don't intend to provide a guarantee that every function pointer will be loaded and available, it's meant to be a quickly hacked-together, low-effort Volk, but for CUDA — except with no global C functions or state.

The CU_DEVICE_ATTRIBUTE_MAX problem is interesting indeed, and highlights why your line of reasoning is correct.

I think there's a middle ground compromise we can come to

@devshgraphicsprogramming
Copy link
Copy Markdown
Member

Ok, let's split into two targets with optional leaking headers.

But instead of accessors lets abuse implicit conversions and constructors
https://godbolt.org/z/WsGqcc1f4

// SDK-free stand-in for CUdeviceptr: 8 bytes, 8-byte aligned, so it can
// alias the real handle across the leak boundary below (size/alignment are
// static_assert-checked by Wrapped).
struct alignas(8) OpaqueCUdeviceptr
{
    uint8_t value[8];
};

//===================================LEAK BOUNDARY========================================
// CUDA SDK DEP now
//#include CUDA
// Stand-in for the real SDK typedef; in the leaking header this would come from cuda.h.
using CUdeviceptr = void*;

// trait: maps an opaque handle type to the native CUDA type it mirrors.
// Primary template is left undefined so unmapped types fail at compile time.
template<typename T>
struct opaque_cuda_type;

template<>
struct opaque_cuda_type<OpaqueCUdeviceptr>
{
    using type = CUdeviceptr;
};

// Bridge type that lives on the leak boundary: implicitly converts to both the
// opaque SDK-free handle and the native CUDA type, so call sites need no casts.
template<typename Opaque>
struct Wrapped
{
    using cuda_t = typename opaque_cuda_type<Opaque>::type;
    // Both sides of the reinterpret_cast below must be trivially copyable and
    // layout-identical, otherwise aliasing one as the other is not sound.
    static_assert(std::is_trivial_v<cuda_t>);
    static_assert(std::is_trivially_copyable_v<Opaque>);
    static_assert(sizeof(Opaque)==sizeof(cuda_t));
    static_assert(alignof(Opaque)==alignof(cuda_t));

    // constructors
    Wrapped() = default;
    Wrapped(const Wrapped&) = default;
    Wrapped(const cuda_t& val) {operator=(val);}
    Wrapped(const Opaque& val) {operator=(val);}

    // assignment
    inline Wrapped& operator=(const Wrapped&) = default;
    inline Wrapped& operator=(const cuda_t& val) {value = val; return *this;}
    // routes through the Opaque conversion so the byte copy happens in one place
    inline Wrapped& operator=(const Opaque& val) {operator Opaque&() = val; return *this;}

    // implicit conversion to the native CUDA value reference
    inline operator cuda_t&() {return value;}
    inline operator const cuda_t&() const {return value;}

    // implicit conversion to the opaque handle reference
    inline operator Opaque&() {return reinterpret_cast<Opaque&>(value);}
    inline operator const Opaque&() const {return reinterpret_cast<const Opaque&>(value);}

    cuda_t value = {};
};

// Type your code here, or load an example.
CUdeviceptr pCudaCall(CUdeviceptr);

// Demonstrates both directions of the implicit-conversion boundary:
// opaque handle -> wrapper -> native CUDA call -> wrapper -> opaque handle.
CUdeviceptr square(int num)
{
    OpaqueCUdeviceptr opaqueHandle = {};

    // construct from the opaque side, then re-assign to exercise operator=
    Wrapped<OpaqueCUdeviceptr> wrapped{opaqueHandle};
    wrapped = opaqueHandle;

    // the wrapper converts implicitly at the native call site and back
    wrapped = pCudaCall(wrapped);
    opaqueHandle = wrapped;

    return wrapped;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants