From f4ce3dc140f5d3abee707f853574bb12bf620131 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 09:28:24 +0200 Subject: [PATCH 01/51] Move CUDA interop behind extension target --- CMakeLists.txt | 15 +-- cmake/FindZLIB.cmake | 4 +- examples_tests | 2 +- .../{video => ext/CUDAInterop}/CCUDADevice.h | 13 ++- .../CUDAInterop}/CCUDAExportableMemory.h | 6 +- .../{video => ext/CUDAInterop}/CCUDAHandler.h | 70 ++++++++++++- .../CUDAInterop}/CCUDAImportedMemory.h | 16 +-- .../CUDAInterop}/CCUDAImportedSemaphore.h | 6 +- include/nbl/ext/CUDAInterop/CUDAInterop.h | 9 ++ include/nbl/ext/OptiX/IDenoiser.h | 4 +- include/nbl/system/DefaultFuncPtrLoader.h | 4 +- include/nbl/video/EApiType.h | 6 ++ include/nbl/video/declarations.h | 5 +- src/nbl/CMakeLists.txt | 23 +---- src/nbl/ext/CMakeLists.txt | 12 +++ .../CUDAInterop}/CCUDADevice.cpp | 5 +- .../CUDAInterop}/CCUDAExportableMemory.cpp | 7 +- .../CUDAInterop}/CCUDAHandler.cpp | 20 +++- .../CUDAInterop}/CCUDAImportedMemory.cpp | 7 +- .../CUDAInterop}/CCUDAImportedSemaphore.cpp | 7 +- src/nbl/ext/CUDAInterop/CMakeLists.txt | 46 +++++++++ src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 35 +++++++ src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 97 +++++++++++++++++++ .../ext/CUDAInterop/smoke/public_boundary.cpp | 15 +++ 24 files changed, 366 insertions(+), 68 deletions(-) rename include/nbl/{video => ext/CUDAInterop}/CCUDADevice.h (93%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.h (93%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAHandler.h (78%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedMemory.h (74%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.h (90%) create mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h rename src/nbl/{video => ext/CUDAInterop}/CCUDADevice.cpp (98%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.cpp (90%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAHandler.cpp (97%) rename src/nbl/{video => 
ext/CUDAInterop}/CCUDAImportedMemory.cpp (84%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.cpp (71%) create mode 100644 src/nbl/ext/CUDAInterop/CMakeLists.txt create mode 100644 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt create mode 100644 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp create mode 100644 src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fa74e167f0..ff90d862ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,9 +70,13 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF) +set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) + if(NBL_CUDA_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE) + endif() find_package(CUDAToolkit REQUIRED) if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") @@ -183,13 +187,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF) -if(NBL_COMPILE_WITH_CUDA) - find_package(OPTIX REQUIRED) - message(STATUS "CUDA enabled and OptiX found!") -else() - if(NBL_BUILD_OPTIX) +if(NBL_BUILD_OPTIX) + if(NOT NBL_COMPILE_WITH_CUDA) message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!") endif() + find_package(OPTIX REQUIRED) + message(STATUS "CUDA enabled and OptiX found!") endif() option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" 
OFF) diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake index f855c396b9..42aa789bee 100644 --- a/cmake/FindZLIB.cmake +++ b/cmake/FindZLIB.cmake @@ -4,4 +4,6 @@ endif() set(ZLIB_FOUND TRUE) set(ZLIB_LIBRARY ZLIB::ZLIB) -set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") \ No newline at end of file +set(ZLIB_LIBRARIES ZLIB::ZLIB) +set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") +set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}") diff --git a/examples_tests b/examples_tests index 93ca5efe58..cbb24a6404 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580 +Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h similarity index 93% rename from include/nbl/video/CCUDADevice.h rename to include/nbl/ext/CUDAInterop/CCUDADevice.h index 02f85fdac8..d7886a4c53 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -5,14 +5,13 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDAImportedSemaphore.h" - - #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" +#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -27,7 +26,7 @@ namespace nbl::video { class CCUDAHandler; -class NBL_API2 CCUDADevice : public core::IReferenceCounted +class CCUDADevice : public core::IReferenceCounted { public: #ifdef _WIN32 diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h similarity index 93% rename from 
include/nbl/video/CCUDAExportableMemory.h rename to include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 1c3d206906..10bf911717 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -7,6 +7,8 @@ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -22,7 +24,7 @@ namespace nbl::video class CCUDADevice; -class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted +class CCUDAExportableMemory : public core::IReferenceCounted { public: @@ -62,4 +64,4 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted #endif // _NBL_COMPILE_WITH_CUDA_ -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h similarity index 78% rename from include/nbl/video/CCUDAHandler.h rename to include/nbl/ext/CUDAInterop/CCUDAHandler.h index 61e9522a66..8c86d9102c 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -9,7 +9,7 @@ #include "nbl/system/declarations.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -17,7 +17,7 @@ namespace nbl::video { -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +class CCUDAHandler : public core::IReferenceCounted { public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -151,6 +151,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted nvrtcCreateProgram, nvrtcDestroyProgram, nvrtcGetLoweredName, + nvrtcGetCUBIN, + nvrtcGetCUBINSize, nvrtcGetPTX, nvrtcGetPTXSize, nvrtcGetProgramLog, @@ -216,6 +218,13 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted }; ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); + struct cubin_and_nvrtcResult_t + { + core::smart_refctd_ptr cubin; + nvrtcResult result; + }; + 
cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog); + // inline ptx_and_nvrtcResult_t compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, @@ -260,6 +269,49 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + m_nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); + } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + m_nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(&program,file,headerCount,headerContents,includeNames); 
+ return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); + } + core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: @@ -281,6 +333,20 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return getPTX(program); } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) + { + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + result = compileProgram(program,nvrtcOptions); + if (log) + getProgramLog(program,*log); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + return getCUBIN(program); + } + // function tables CUDA m_cuda; NVRTC m_nvrtc; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h similarity index 74% rename from include/nbl/video/CCUDAImportedMemory.h rename to include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 4e3bfcd085..5f885abd2d 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -1,20 +1,22 @@ -#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H -#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 #error "Need CUDA 9.0 SDK or higher." 
#endif -#endif // _NBL_COMPILE_WITH_CUDA - namespace nbl::video { -class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +class CCUDADevice; + +class CCUDAImportedMemory : public core::IReferenceCounted { public: @@ -39,4 +41,6 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted } -#endif \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h similarity index 90% rename from include/nbl/video/CCUDAImportedSemaphore.h rename to include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 2e5010fa2d..409ef1a676 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -6,6 +6,8 @@ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -19,7 +21,9 @@ namespace nbl::video { -class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted +class CCUDADevice; + +class CCUDAImportedSemaphore : public core::IReferenceCounted { public: diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h new file mode 100644 index 0000000000..b30d096049 --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h @@ -0,0 +1,9 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" + +#endif diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 7820aa1222..496383d92d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "../../../../src/nbl/video/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #include #include @@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted } } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index 56142448c8..bbb9884e7a 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -35,9 +35,9 @@ class DefaultFuncPtrLoader final : FuncPtrLoader return lib!=nullptr; } - void* loadFuncPtr(const char* funcname) override final; + NBL_API2 void* loadFuncPtr(const char* funcname) override final; }; } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 7f99d40309..db29abe54d 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -4,6 +4,12 @@ #include "nbl/core/declarations.h" #include +#ifdef _WIN32 +#include +#else +#include +#endif + namespace nbl::video { diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 37f2f864bf..4393af1768 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -24,9 +24,6 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/surface/CSurfaceVulkan.h" -// CUDA -#include "nbl/video/CCUDAHandler.h" - // utilities #include "nbl/video/utilities/CDumbPresentationOracle.h" 
#include "nbl/video/utilities/ICommandPoolCache.h" @@ -44,4 +41,4 @@ //#include "nbl/video/IGPUVirtualTexture.h" -#endif \ No newline at end of file +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 692efec8bd..de9bde3952 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) - if (NBL_BUILD_OPTIX) - set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) - endif() +if (NBL_BUILD_OPTIX) + set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() # => TODO: clean! @@ -291,12 +287,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp -# CUDA - video/CCUDAHandler.cpp - video/CCUDADevice.cpp - video/CCUDAImportedSemaphore.cpp - video/CCUDAExportableMemory.cpp - video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES @@ -425,10 +415,6 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_COMPILE_WITH_CUDA) - target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) -endif() - set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) @@ -664,11 +650,6 @@ target_link_libraries(Nabla PRIVATE volk) # volk is part of public interface headers in Nabla target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) -# CUDA -if (NBL_COMPILE_WITH_CUDA) - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") -endif() - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS # this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path # -> TODO diff --git 
a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index f3b55531c2..1f815413e8 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -38,6 +38,18 @@ if (NBL_BUILD_OPTIX) ) endif() +add_subdirectory(CUDAInterop) +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS + ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS} + PARENT_SCOPE + ) + set(NBL_EXT_CUDA_INTEROP_LIB + ${NBL_EXT_CUDA_INTEROP_LIB} + PARENT_SCOPE + ) +endif() + if (NBL_BUILD_IMGUI) add_subdirectory(ImGui) set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp similarity index 98% rename from src/nbl/video/CCUDADevice.cpp rename to src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 27f8f6f906..aa06c6e7bf 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,13 +1,14 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _WIN32 #include #endif -#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp similarity index 90% rename from src/nbl/video/CCUDAExportableMemory.cpp rename to src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 66cbbdcf4f..65afdca660 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video @@ -51,4 +52,4 @@ CCUDAExportableMemory::~CCUDAExportableMemory() } } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp similarity index 97% rename from src/nbl/video/CCUDAHandler.cpp rename to src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 060afe6631..f9048d3bb6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -488,7 +488,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste {\ if (!cuda.p ## FUNC)\ return nullptr;\ - auto result = cuda.p ## FUNC ## (__VA_ARGS__);\ + auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ return nullptr;\ } @@ -570,6 +570,22 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } +CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog) +{ + size_t _size = 0ull; + nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size); + if (sizeRes!=NVRTC_SUCCESS) + return {nullptr,sizeRes}; + if (_size==0ull) + return {nullptr,NVRTC_ERROR_INVALID_INPUT}; + + asset::ICPUBuffer::SCreationParams cubinParams = {}; + cubinParams.size = _size; + auto cubin = 
asset::ICPUBuffer::create(std::move(cubinParams)); + auto cubinPtr = static_cast(cubin->getPointer()); + return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)}; +} + core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) { if (!vulkanConnection) diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp similarity index 84% rename from src/nbl/video/CCUDAImportedMemory.cpp rename to src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index 7e21b05ef1..a785bad9b9 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -29,4 +30,4 @@ CCUDAImportedMemory::~CCUDAImportedMemory() } -#endif \ No newline at end of file +#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp similarity index 71% rename from src/nbl/video/CCUDAImportedSemaphore.cpp rename to src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index 0dc750a4a9..1ca4a34190 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedSemaphore.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video @@ -15,4 +16,4 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore() } } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt new file mode 100644 index 0000000000..d3f8e85169 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -0,0 +1,46 @@ +include(${NBL_ROOT_PATH}/cmake/common.cmake) + +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop") + + set(NBL_EXT_CUDA_INTEROP_H + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h + ) + + set(NBL_EXT_CUDA_INTEROP_SRC + CCUDADevice.cpp + CCUDAExportableMemory.cpp + CCUDAHandler.cpp + CCUDAImportedMemory.cpp + CCUDAImportedSemaphore.cpp + ) + + nbl_create_ext_library_project( + CUDA_INTEROP + "${NBL_EXT_CUDA_INTEROP_H}" + "${NBL_EXT_CUDA_INTEROP_SRC}" + "" + "" + "_NBL_COMPILE_WITH_CUDA_" + ) + + set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + if(CUDAToolkit_ROOT) + list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS + "${CUDAToolkit_ROOT}/include" + "${CUDAToolkit_ROOT}/include/cccl" + ) + endif() + list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) + list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + + target_include_directories(${LIB_NAME} BEFORE PUBLIC 
${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS}) + add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) +endif() + +add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt new file mode 100644 index 0000000000..7805153e32 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -0,0 +1,35 @@ +enable_testing() + +set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS) +if(CMAKE_CONFIGURATION_TYPES) + set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $) +endif() + +function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) + nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + + set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") + if(CUDAToolkit_BIN_DIR) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") + endif() + + add_test( + NAME ${TARGET_NAME}.build + COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS} + ) + add_test(NAME ${TARGET_NAME}.run COMMAND $) + set_tests_properties(${TARGET_NAME}.run PROPERTIES + DEPENDS ${TARGET_NAME}.build + ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" + ) +endfunction() + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) +target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) + +if(TARGET Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp) + target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop) +endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp new file mode 100644 index 0000000000..d6afab79d2 --- /dev/null +++ 
b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -0,0 +1,97 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#include +#include +#include +#include + +#ifndef _NBL_COMPILE_WITH_CUDA_ +#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." +#endif + +namespace +{ +using namespace nbl; +using namespace nbl::video; + +[[maybe_unused]] bool compileVulkanCudaInteropRecipe( + CCUDADevice& cudaDevice, + ILogicalDevice* vulkanDevice, + core::smart_refctd_ptr vulkanMemory, + core::smart_refctd_ptr vulkanSemaphore) +{ + auto cudaMemory = cudaDevice.createExportableMemory({ + .size = 4096, + .alignment = 4096, + .location = CU_MEM_LOCATION_TYPE_DEVICE, + }); + if (!cudaMemory) + return false; + + auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); + auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); + auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); + + CUdeviceptr mappedVulkanMemory = 0; + if (importedFromVulkan) + importedFromVulkan->getMappedBuffer(&mappedVulkanMemory); + + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
importedSemaphore->getInternalObject():nullptr; + return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore; +} + +bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) +{ + auto& cuda = handler.getCUDAFunctionTable(); + + CUcontext context = nullptr; + if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) + return false; + + CUcontext poppedContext = nullptr; + auto releaseContext = [&]() + { + if (context) + { + cuda.pcuCtxPopCurrent_v2(&poppedContext); + cuda.pcuDevicePrimaryCtxRelease_v2(device); + } + }; + + if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS) + { + releaseContext(); + return false; + } + + constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; + std::array output = {}; + + CUdeviceptr deviceMemory = 0; + bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; + ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS; + ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS; + if (deviceMemory) + ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; + + releaseContext(); + return ok && std::ranges::equal(input, output); +} +} + +int main() +{ + static_assert(std::is_same_v().getInternalObject()), CUdevice>); + CUdeviceptr devicePtr = 0; + static_cast(devicePtr); + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return 0; + + const auto& devices = handler->getAvailableDevices(); + if (devices.empty()) + return 0; + + return cudaDriverRoundtrip(*handler, devices.front().handle) ? 
0:1; +} diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp new file mode 100644 index 0000000000..809d1e7b93 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -0,0 +1,15 @@ +#include "nabla.h" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Default Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Default Nabla consumers must not include CUDA SDK headers." +#endif + +int main() +{ + return 0; +} From 78845ae3f2bfb360316aab2f905d0b415165d52c Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 11:43:32 +0200 Subject: [PATCH 02/51] Address CUDA interop review cleanup --- CMakeLists.txt | 2 +- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 66 ------------------- include/nbl/ext/CUDAInterop/CUDAInterop.h | 4 ++ include/nbl/system/DefaultFuncPtrLoader.h | 8 +-- include/nbl/video/EApiType.h | 31 +-------- src/nbl/CMakeLists.txt | 1 + src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 16 ----- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 36 ++++++---- src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 50 ++++++++++---- .../ext/CUDAInterop/smoke/public_boundary.cpp | 24 ++++++- src/nbl/video/EApiType.cpp | 37 +++++++++++ 12 files changed, 130 insertions(+), 147 deletions(-) create mode 100644 src/nbl/video/EApiType.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ff90d862ce..c5e1bfac20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,7 +75,7 @@ set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NB if(NBL_COMPILE_WITH_CUDA) if(NBL_CUDA_TOOLKIT_ROOT) - set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() find_package(CUDAToolkit REQUIRED) if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") diff --git a/examples_tests b/examples_tests index 
cbb24a6404..5c604d274b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22 +Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6 diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 8c86d9102c..5128aad575 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -151,8 +151,6 @@ class CCUDAHandler : public core::IReferenceCounted nvrtcCreateProgram, nvrtcDestroyProgram, nvrtcGetLoweredName, - nvrtcGetCUBIN, - nvrtcGetCUBINSize, nvrtcGetPTX, nvrtcGetPTXSize, nvrtcGetProgramLog, @@ -218,13 +216,6 @@ class CCUDAHandler : public core::IReferenceCounted }; ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - struct cubin_and_nvrtcResult_t - { - core::smart_refctd_ptr cubin; - nvrtcResult result; - }; - cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog); - // inline ptx_and_nvrtcResult_t compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, @@ -269,49 +260,6 @@ class CCUDAHandler : public core::IReferenceCounted return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); - } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - const char* source, const char* 
filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: @@ -333,20 +281,6 @@ class CCUDAHandler : public core::IReferenceCounted return getPTX(program); } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getCUBIN(program); - } - // function tables CUDA m_cuda; NVRTC m_nvrtc; diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h index b30d096049..06d9016dc8 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInterop.h +++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h @@ -4,6 +4,10 @@ #ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ +#include "nbl/ext/CUDAInterop/CCUDADevice.h" 
+#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" #include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" #endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index bbb9884e7a..10fab3a454 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -11,18 +11,18 @@ namespace nbl::system { -class DefaultFuncPtrLoader final : FuncPtrLoader +class NBL_API2 DefaultFuncPtrLoader final : FuncPtrLoader { void* lib; public: inline DefaultFuncPtrLoader() : lib(nullptr) {} - NBL_API2 DefaultFuncPtrLoader(const char* name); + DefaultFuncPtrLoader(const char* name); inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader() { operator=(std::move(other)); } - NBL_API2 ~DefaultFuncPtrLoader(); + ~DefaultFuncPtrLoader(); inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other) { @@ -35,7 +35,7 @@ class DefaultFuncPtrLoader final : FuncPtrLoader return lib!=nullptr; } - NBL_API2 void* loadFuncPtr(const char* funcname) override final; + void* loadFuncPtr(const char* funcname) override final; }; } diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index db29abe54d..44a31ecf90 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -4,12 +4,6 @@ #include "nbl/core/declarations.h" #include -#ifdef _WIN32 -#include -#else -#include -#endif - namespace nbl::video { @@ -34,29 +28,8 @@ constexpr external_handle_t ExternalHandleNull = nullptr; constexpr external_handle_t ExternalHandleNull = -1; #endif -inline bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return (close(handle) == 0); -#endif -} - -inline external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE re = ExternalHandleNull; - - const HANDLE cur = 
GetCurrentProcess(); - if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return re; -#else - return dup(handle); -#endif -} +NBL_API2 bool CloseExternalHandle(external_handle_t handle); +NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle); } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index de9bde3952..acbf4d4dda 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -248,6 +248,7 @@ set(NBL_VIDEO_SOURCES video/IGPUAccelerationStructure.cpp video/IGPUCommandBuffer.cpp video/IQueue.cpp + video/EApiType.cpp video/IGPUDescriptorSet.cpp video/IDeviceMemoryAllocation.cpp video/IDeviceMemoryBacked.cpp diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index f9048d3bb6..748a88d1a1 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -570,22 +570,6 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } -CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog) -{ - size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size); - if (sizeRes!=NVRTC_SUCCESS) - return {nullptr,sizeRes}; - if (_size==0ull) - return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - - asset::ICPUBuffer::SCreationParams cubinParams = {}; - cubinParams.size = _size; - auto cubin = asset::ICPUBuffer::create(std::move(cubinParams)); - auto cubinPtr = static_cast(cubin->getPointer()); - return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)}; -} - core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) { if (!vulkanConnection) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 7805153e32..678cd29d84 100644 --- 
a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,27 +1,35 @@ -enable_testing() - -set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS) -if(CMAKE_CONFIGURATION_TYPES) - set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $) +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + cmake_minimum_required(VERSION 3.30) + project(NblExtCUDAInteropSmoke CXX) + find_package(Nabla REQUIRED CONFIG) endif() +enable_testing() + function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_executable(${TARGET_NAME} ${SOURCE_FILE}) target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) - nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + if(MSVC) + target_compile_options(${TARGET_NAME} PRIVATE + /Gm- + /bigobj + /Zc:wchar_t + /Zc:preprocessor + /Zc:inline + /Zc:forScope + ) + endif() + if(COMMAND nbl_adjust_flags) + nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + endif() set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") if(CUDAToolkit_BIN_DIR) list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") endif() - add_test( - NAME ${TARGET_NAME}.build - COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS} - ) - add_test(NAME ${TARGET_NAME}.run COMMAND $) - set_tests_properties(${TARGET_NAME}.run PROPERTIES - DEPENDS ${TARGET_NAME}.build + add_test(NAME ${TARGET_NAME} COMMAND $) + set_tests_properties(${TARGET_NAME} PROPERTIES ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" ) endfunction() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp index d6afab79d2..adcb48e6de 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ 
b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -1,9 +1,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/IApplicationFramework.h" #include #include #include #include +#include #ifndef _NBL_COMPILE_WITH_CUDA_ #error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." @@ -69,8 +71,10 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) CUdeviceptr deviceMemory = 0; bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; - ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS; - ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS; if (deviceMemory) ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; @@ -79,19 +83,37 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) } } -int main() +class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework { - static_assert(std::is_same_v().getInternalObject()), CUdevice>); - CUdeviceptr devicePtr = 0; - static_cast(devicePtr); + using base_t = nbl::system::IApplicationFramework; - auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); - if (!handler) - return 0; +public: + using base_t::base_t; - const auto& devices = handler->getAvailableDevices(); - if (devices.empty()) - return 0; + bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + { + static_cast(system); - return cudaDriverRoundtrip(*handler, devices.front().handle) ? 
0:1; -} + if (!isAPILoaded()) + return false; + + static_assert(std::is_same_v().getInternalObject()), CUdevice>); + CUdeviceptr devicePtr = 0; + static_cast(devicePtr); + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return true; + + const auto& devices = handler->getAvailableDevices(); + if (devices.empty()) + return true; + + return cudaDriverRoundtrip(*handler, devices.front().handle); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +NBL_MAIN_FUNC(CUDAInteropOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index 809d1e7b93..c39ba076d4 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,4 +1,5 @@ #include "nabla.h" +#include "nbl/system/IApplicationFramework.h" #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -9,7 +10,26 @@ #error "Default Nabla consumers must not include CUDA SDK headers." 
#endif -int main() +namespace { - return 0; + +class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + { + static_cast(system); + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + } + +NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke) diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp new file mode 100644 index 0000000000..d7eadd8b08 --- /dev/null +++ b/src/nbl/video/EApiType.cpp @@ -0,0 +1,37 @@ +#include "nbl/video/EApiType.h" + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#else +#include +#endif + +namespace nbl::video +{ + +bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle)==0; +#endif +} + +external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} + +} From ab9a7e560fadaf960a1a9f4879a02f6e66833d2a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 12:04:30 +0200 Subject: [PATCH 03/51] Simplify CUDA interop smoke CMake --- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 678cd29d84..89dd821add 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,6 +1,9 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) cmake_minimum_required(VERSION 3.30) 
project(NblExtCUDAInteropSmoke CXX) +endif() + +if(NOT TARGET Nabla::Nabla) find_package(Nabla REQUIRED CONFIG) endif() @@ -19,19 +22,8 @@ function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) /Zc:forScope ) endif() - if(COMMAND nbl_adjust_flags) - nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) - endif() - - set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") - if(CUDAToolkit_BIN_DIR) - list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") - endif() add_test(NAME ${TARGET_NAME} COMMAND $) - set_tests_properties(${TARGET_NAME} PROPERTIES - ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" - ) endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) From bf8eeb3509935dd7f0b5970e87a44ea88bf5a4fb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 12:27:26 +0200 Subject: [PATCH 04/51] Clean CUDA interop smoke usage requirements --- src/nbl/CMakeLists.txt | 7 +++++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 16 ++-------------- src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 6 +----- .../ext/CUDAInterop/smoke/public_boundary.cpp | 7 +++---- 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index acbf4d4dda..bb96bdfc80 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -763,8 +763,11 @@ if(TARGET ngfx) ) endif() -# on MSVC it won't compile without this option! -target_compile_options(Nabla PUBLIC $<$:/bigobj>) +# on MSVC it won't compile without these options! 
+target_compile_options(Nabla PUBLIC + $<$:/bigobj> + $<$:/Zc:preprocessor> +) if(NBL_PCH) target_precompile_headers(Nabla diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 89dd821add..23dd6d5422 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,7 +1,5 @@ -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - cmake_minimum_required(VERSION 3.30) - project(NblExtCUDAInteropSmoke CXX) -endif() +cmake_minimum_required(VERSION 3.30) +project(NblExtCUDAInteropSmoke CXX) if(NOT TARGET Nabla::Nabla) find_package(Nabla REQUIRED CONFIG) @@ -12,16 +10,6 @@ enable_testing() function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) add_executable(${TARGET_NAME} ${SOURCE_FILE}) target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) - if(MSVC) - target_compile_options(${TARGET_NAME} PRIVATE - /Gm- - /bigobj - /Zc:wchar_t - /Zc:preprocessor - /Zc:inline - /Zc:forScope - ) - endif() add_test(NAME ${TARGET_NAME} COMMAND $) endfunction() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp index adcb48e6de..bc8c8952bd 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -90,16 +90,12 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework public: using base_t::base_t; - bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_cast(system); - if (!isAPILoaded()) return false; static_assert(std::is_same_v().getInternalObject()), CUdevice>); - CUdeviceptr devicePtr = 0; - static_cast(devicePtr); auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index c39ba076d4..4f6cbebfb1 100644 --- 
a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -3,11 +3,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Default Nabla consumers must not get the CUDA opt-in define." +#error "Nabla consumers must not get the CUDA opt-in define." #endif #ifdef CUDA_VERSION -#error "Default Nabla consumers must not include CUDA SDK headers." +#error "Nabla consumers must not include CUDA SDK headers." #endif namespace @@ -20,9 +20,8 @@ class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFra public: using base_t::base_t; - bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_cast(system); return isAPILoaded(); } From f701ac63e83bea4bf743af80a6fe29af81d002c0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 13:28:07 +0200 Subject: [PATCH 05/51] Export CUDA interop package target --- cmake/NablaConfig.cmake.in | 33 ++++++++++++++++++++ cmake/common.cmake | 19 +++++++++-- src/nbl/CMakeLists.txt | 19 ++++++++++- src/nbl/ext/CUDAInterop/CMakeLists.txt | 5 ++- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 2 +- 5 files changed, 72 insertions(+), 6 deletions(-) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index b22b3ad0d7..e88a25b0dd 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS) elseif(_NBL_NABLA_COMPONENT STREQUAL "Core") set(_NBL_NABLA_LOAD_CORE ON) set(Nabla_Core_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop") + 
set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(Nabla_CUDAInterop_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -80,6 +85,34 @@ if(_NBL_NABLA_LOAD_NSC) endif() endif() +if(_NBL_NABLA_LOAD_CUDA_INTEROP) + include(CMakeFindDependencyMacro) + + if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") + set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") + endif() + + find_dependency(CUDAToolkit REQUIRED) + if(CUDAToolkit_VERSION VERSION_LESS "13.0") + set(Nabla_CUDAInterop_FOUND FALSE) + if(Nabla_FIND_REQUIRED_CUDAInterop) + message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.") + endif() + else() + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) + foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS) + if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") + list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") + endif() + endforeach() + list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS) + target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS}) + endif() + endif() +endif() + check_required_components(Nabla) # diff --git a/cmake/common.cmake b/cmake/common.cmake index c50e1f6fb2..ae2264fda4 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -284,9 +284,22 @@ function(nbl_install_dir _DIR) endfunction() function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION 
debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) + cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN}) + if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}") + endif() + + if(_NBL_INSTALL_LIB_EXPORT) + install(TARGETS ${_TARGETS} + EXPORT ${_NBL_INSTALL_LIB_EXPORT} + ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION} + COMPONENT Libraries + ) + else() + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) + endif() endfunction() function(nbl_install_lib _TARGETS) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bb96bdfc80..6c3ab2606d 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -778,11 +778,28 @@ if(NBL_PCH) ) endif() -# extensions start_tracking_variables_for_propagation_to_parent() add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() +if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) + + set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets) + endif() + nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS}) + + if(NBL_ENABLE_CONFIG_INSTALL AND NOT 
NBL_STATIC_BUILD) + install(EXPORT NablaCUDAInteropExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() +endif() + if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE") diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index d3f8e85169..93b6bef8c1 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -39,7 +39,10 @@ if (NBL_COMPILE_WITH_CUDA) list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - target_include_directories(${LIB_NAME} BEFORE PUBLIC ${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS}) + foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + target_include_directories(${LIB_NAME} BEFORE PUBLIC $) + endforeach() + set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 23dd6d5422..cd9ba7b70e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) if(NOT TARGET Nabla::Nabla) - find_package(Nabla REQUIRED CONFIG) + find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop) endif() enable_testing() From a520d57a443c421d41e5f72c14cec70d29d6f175 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 13:42:37 +0200 Subject: [PATCH 06/51] Use CUDAToolkit package targets --- cmake/NablaConfig.cmake.in | 9 +-------- src/nbl/ext/CUDAInterop/CMakeLists.txt | 14 +------------- 2 files changed, 2 
insertions(+), 21 deletions(-) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index e88a25b0dd..ca32518244 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -101,14 +101,7 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) else() _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) - foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS) - if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") - list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") - endif() - endforeach() - list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS) - target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS}) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() endif() diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 93b6bef8c1..7a69e62ad4 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -29,19 +29,7 @@ if (NBL_COMPILE_WITH_CUDA) "_NBL_COMPILE_WITH_CUDA_" ) - set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - if(CUDAToolkit_ROOT) - list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS - "${CUDAToolkit_ROOT}/include" - "${CUDAToolkit_ROOT}/include/cccl" - ) - endif() - list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) - list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - - foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - target_include_directories(${LIB_NAME} BEFORE PUBLIC $) - endforeach() + target_link_libraries(${LIB_NAME} PUBLIC $) set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") 
add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) endif() From 4bddc571ade70a289036d87772a85b35870c5307 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 14:29:27 +0200 Subject: [PATCH 07/51] Require CUDA version via CMake --- CMakeLists.txt | 8 ++------ cmake/NablaConfig.cmake.in | 15 ++++----------- .../ext/CUDAInterop/smoke/public_boundary.cpp | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5e1bfac20..14845789fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,12 +77,8 @@ if(NBL_COMPILE_WITH_CUDA) if(NBL_CUDA_TOOLKIT_ROOT) set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() - find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") - message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") - else() - message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") - endif() + find_package(CUDAToolkit 13.0 REQUIRED) + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") endif() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index ca32518244..8b9f62e548 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -92,17 +92,10 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") endif() - find_dependency(CUDAToolkit REQUIRED) - if(CUDAToolkit_VERSION VERSION_LESS "13.0") - set(Nabla_CUDAInterop_FOUND FALSE) - if(Nabla_FIND_REQUIRED_CUDAInterop) - message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. 
Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.") - endif() - else() - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) - endif() + find_dependency(CUDAToolkit 13.0 REQUIRED) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index 4f6cbebfb1..eb7061f0ee 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,5 +1,23 @@ #include "nabla.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + #include "nbl/system/IApplicationFramework.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." 
+#endif + #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ From 6f68e6644eb222cc5c6a875a8a85e97650261537 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:17:15 +0200 Subject: [PATCH 08/51] Split CUDA interop native surface --- cmake/NablaConfig.cmake.in | 17 +- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 43 +-- .../ext/CUDAInterop/CCUDAExportableMemory.h | 88 +++--- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 274 ++---------------- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 41 +-- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 47 ++- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 211 ++++++++++++++ src/nbl/CMakeLists.txt | 14 + src/nbl/ext/CMakeLists.txt | 4 + src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 85 ++++-- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 34 ++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 161 ++++++++-- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 32 +- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 24 +- src/nbl/ext/CUDAInterop/CMakeLists.txt | 17 +- .../CUDAInterop/CUDAInteropNativeState.hpp | 106 +++++++ src/nbl/ext/CUDAInterop/README.md | 23 ++ src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 17 +- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 42 +++ .../smoke/{opt_in.cpp => native_opt_in.cpp} | 25 +- 21 files changed, 817 insertions(+), 490 deletions(-) create mode 100644 include/nbl/ext/CUDAInterop/CUDAInteropNative.h create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp create mode 100644 src/nbl/ext/CUDAInterop/README.md create mode 100644 src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp rename src/nbl/ext/CUDAInterop/smoke/{opt_in.cpp => native_opt_in.cpp} (72%) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index 8b9f62e548..afff3dcccc 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -7,6 +7,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso 
set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -30,6 +31,12 @@ if(_NBL_NABLA_COMPONENTS) set(_NBL_NABLA_LOAD_CORE ON) set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) set(Nabla_CUDAInterop_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative") + set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON) + set(Nabla_CUDAInterop_FOUND TRUE) + set(Nabla_CUDAInteropNative_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -86,6 +93,10 @@ if(_NBL_NABLA_LOAD_NSC) endif() if(_NBL_NABLA_LOAD_CUDA_INTEROP) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) +endif() + +if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) include(CMakeFindDependencyMacro) if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") @@ -93,9 +104,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) endif() find_dependency(CUDAToolkit 13.0 REQUIRED) - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative) + target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit) endif() endif() diff --git a/examples_tests b/examples_tests index 5c604d274b..7a2a4f604f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6 +Subproject commit 
7a2a4f604fd941984d6624e3059f7380cc6592a2 diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index d7886a4c53..25c40e7ed6 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -4,37 +4,32 @@ #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ - -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" #include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" #include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" #include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include +#include namespace nbl::video { class CCUDAHandler; +namespace cuda_native +{ +struct SAccess; +} + class CCUDADevice : public core::IReferenceCounted { - public: + public: + struct SNativeState; #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif enum E_VIRTUAL_ARCHITECTURE @@ -73,22 +68,20 @@ class CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, 
const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - ~CCUDADevice(); + ~CCUDADevice() override; inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - CUdevice getInternalObject() const { return m_handle; } - const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - size_t roundToGranularity(CUmemLocationType location, size_t size) const; + size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const; core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); @@ -97,24 +90,20 @@ class CCUDADevice : public core::IReferenceCounted core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: - CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; + friend struct cuda_native::SAccess; static constexpr auto CudaMemoryLocationCount = 5; - const system::logger_opt_ptr m_logger; + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; - CUdevice m_handle; - CUcontext m_context; - std::array m_allocationGranularity; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 10bf911717..5973c31fac 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -4,64 
+4,60 @@ #ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ - -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { - class CCUDADevice; -class CCUDAExportableMemory : public core::IReferenceCounted +namespace cuda_native { - public: - - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; - - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; - - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; - - CUdeviceptr getDeviceptr() const { return m_params.ptr; } - - const SCreationParams& getCreationParams() const { return m_params; } - - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; +struct SAccess; +} - private: +enum class ECUDAMemoryLocation : uint32_t +{ + DEVICE = 1, + HOST = 2, + HOST_NUMA = 3, + HOST_NUMA_CURRENT = 4 +}; - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; +class CCUDAExportableMemory : public core::IReferenceCounted +{ + public: + struct SNativeState; + struct SCreationParams + { + size_t size; + uint32_t alignment; + ECUDAMemoryLocation location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + external_handle_t externalHandle; + }; + + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + ~CCUDAExportableMemory() override; + + const SCreationParams& getCreationParams() const { return m_params; } + + 
core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + private: + friend struct cuda_native::SAccess; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 5128aad575..063598a518 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -9,158 +9,30 @@ #include "nbl/system/declarations.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include +#include +#include +#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +class CCUDADevice; +class CVulkanConnection; +class IPhysicalDevice; +namespace cuda_native +{ +struct SAccess; +} class CCUDAHandler : public core::IReferenceCounted { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); - - inline bool defaultHandleResult(CUresult result) const - { - core::smart_refctd_ptr logger = m_logger.get(); - return defaultHandleResult(result,logger.get()); - } - - // - bool defaultHandleResult(nvrtcResult result); - - // - template - static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } - - // + public: + struct SNativeState; static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - // - using LibLoader = system::DefaultFuncPtrLoader; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - 
,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback - ); - const CUDA& getCUDAFunctionTable() const {return m_cuda;} - - 
NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize - ); - const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - - // inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); @@ -169,29 +41,9 @@ class CCUDAHandler : public core::IReferenceCounted inline const auto& getSTDHeaderContents() { return m_headerContents; } inline const auto& getSTDHeaderNames() { return m_headerNames; } - // - nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames); - } - inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); - } - struct 
SCUDADeviceInfo { - CUdevice handle = {}; - CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + std::array uuid = {}; }; inline core::vector const& getAvailableDevices() const @@ -199,93 +51,15 @@ class CCUDAHandler : public core::IReferenceCounted return m_availableDevices; } - // - inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) - { - return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); - } - - // - nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log); - - // - struct ptx_and_nvrtcResult_t - { - core::smart_refctd_ptr ptx; - nvrtcResult result; - }; - ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: + ~CCUDAHandler() override; - ~CCUDAHandler() = default; - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getPTX(program); - } - - // function tables - CUDA m_cuda; - NVRTC m_nvrtc; + private: + friend struct cuda_native::SAccess; - // + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; @@ -295,16 +69,6 @@ class CCUDAHandler : public core::IReferenceCounted int m_version; }; -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!((handler)->defaultHandleResult(cudaResult))) { \ - assert(false); \ - } \ - } while(0) - } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 5f885abd2d..8a24f83907 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -1,46 +1,37 @@ #ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ #define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif +#include +#include namespace nbl::video { class CCUDADevice; -class CCUDAImportedMemory : public core::IReferenceCounted +namespace cuda_native { - public: - - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} - - ~CCUDAImportedMemory() override; +struct SAccess; +} - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); +class CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); - private: + ~CCUDAImportedMemory() override; - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + private: + friend struct cuda_native::SAccess; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 409ef1a676..3ee03fb045 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -4,47 +4,36 @@ #ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { class CCUDADevice; +namespace cuda_native +{ +struct SAccess; +} + class CCUDAImportedSemaphore : public core::IReferenceCounted { - public: - - CUexternalSemaphore getInternalObject() const { return m_handle; } - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - private: - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + ~CCUDAImportedSemaphore() override; + + private: + friend struct cuda_native::SAccess; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h new file mode 100644 index 0000000000..f913664122 --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -0,0 +1,211 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#include "nbl/asset/ICPUBuffer.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 13000 + #error "Need CUDA 13.0 SDK or higher." 
+#endif + +namespace nbl::video::cuda_native +{ + +using LibLoader = system::DefaultFuncPtrLoader; + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload + ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + 
,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +struct SCUDADeviceInfo +{ + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; +}; + +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + +template +T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } + +const core::vector& getAvailableDevices(const CCUDAHandler& handler); + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); +} 
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); + +struct ptx_and_nvrtcResult_t +{ + core::smart_refctd_ptr ptx; + nvrtcResult result; +}; + +ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +); +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +); + +CUdevice getInternalObject(const CCUDADevice& device); +CUcontext getContext(const CCUDADevice& device); +size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); +CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +CUexternalSemaphore 
getInternalObject(const CCUDAImportedSemaphore& semaphore); + +} + +#define ASSERT_CUDA_SUCCESS(expr, handler) \ + do { \ + const auto cudaResult = (expr); \ + if (!nbl::video::cuda_native::defaultHandleResult(*(handler), cudaResult)) { \ + assert(false); \ + } \ + } while(0) + +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 6c3ab2606d..ecf7f555c3 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -798,6 +798,20 @@ if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) COMPONENT Libraries ) endif() + + if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} + EXPORT NablaCUDAInteropNativeExportTargets + COMPONENT Libraries + ) + install(EXPORT NablaCUDAInteropNativeExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() + endif() endif() if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index 1f815413e8..59ae49285e 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -48,6 +48,10 @@ if (NBL_COMPILE_WITH_CUDA) ${NBL_EXT_CUDA_INTEROP_LIB} PARENT_SCOPE ) + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB + ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} + PARENT_SCOPE + ) endif() if (NBL_BUILD_IMGUI) diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index aa06c6e7bf..5f59545173 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,15 +1,12 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" - #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -18,28 +15,27 @@ CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - CUdevice device, + std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), - m_handle(device), m_handler(std::move(handler)), - m_allocationGranularity{} + m_native(std::move(nativeState)) { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler); - for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) + for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) { #ifdef _WIN32 @@ -50,24 +46,47 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = 
ALLOCATION_HANDLE_TYPE, - .location = { .type = static_cast(locationType), .id = m_handle }, + .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } -size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const +{ + return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size); +} + +namespace cuda_native +{ + +CUdevice getInternalObject(const CCUDADevice& device) +{ + return SAccess::native(device).handle; +} + +CUcontext getContext(const CCUDADevice& device) { - return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; + return SAccess::native(device).context; +} + +size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +{ + const auto& granularity = SAccess::native(device).allocationGranularity[location]; + return ((size - 1) / granularity + 1) * granularity; +} + } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const +static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto handler = device.getHandler(); + const auto& native = cuda_native::SAccess::native(device); + const auto& cu = 
cuda_native::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -75,19 +94,19 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = m_handle }, + .location = { .type = location, .id = native.handle }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); return err; } @@ -100,7 +119,8 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - auto& cu = m_handler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto nativeLocation = cuda_native::toNative(params.location); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -110,14 +130,15 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, + .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .location = { .type = nativeLocation, .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; params.granularSize = roundToGranularity(params.location, params.size); + auto nativeState = std::make_unique(); 
CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -133,7 +154,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err) { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); @@ -152,12 +173,12 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -180,12 +201,12 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), std::make_unique(cuExtMem)); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = m_handler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -210,12 +231,12 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - 
return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), std::make_unique(cusema)); } CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); + ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 65afdca660..94d18c40bb 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,14 +2,18 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); @@ -18,10 +22,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM switch (m_params.location) { - case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case ECUDAMemoryLocation::DEVICE: memoryTypeBits &= vram; break; + case ECUDAMemoryLocation::HOST_NUMA: + case ECUDAMemoryLocation::HOST_NUMA_CURRENT: + case 
ECUDAMemoryLocation::HOST: memoryTypeBits &= ~vram; break; default: break; } @@ -40,15 +44,25 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler()); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); assert(closeSucceed); +} + +namespace cuda_native +{ + +CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) +{ + return SAccess::native(memory).ptr; +} + } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 748a88d1a1..49e36083d4 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -13,13 +13,11 @@ namespace nbl::video { CCUDAHandler::CCUDAHandler( - CUDA&& _cuda, - NVRTC&& _nvrtc, + std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)) - , m_nvrtc(std::move(_nvrtc)) + : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) , m_version(_version) @@ -32,29 +30,38 @@ CCUDAHandler::CCUDAHandler( } int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) return; for (int device_i = 0; device_i < deviceCount; device_i++) { CUdevice handle = -1; - if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - m_availableDevices.emplace_back(handle, uuid); + auto& nativeDevice = m_native->availableDevices.emplace_back(); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; + auto& cleanDevice = m_availableDevices.emplace_back(); + memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = m_availableDevices.back().attributes; + int* attributes = nativeDevice.attributes; for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); } } -bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) 
+CCUDAHandler::~CCUDAHandler() = default; + +namespace cuda_native +{ + +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) { @@ -420,7 +427,12 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt return false; } -bool CCUDAHandler::defaultHandleResult(nvrtcResult result) +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +{ + return defaultHandleResult(result,SAccess::logger(handler)); +} + +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch (result) { @@ -428,19 +440,21 @@ bool CCUDAHandler::defaultHandleResult(nvrtcResult result) return true; break; default: - if (m_nvrtc.pnvrtcGetErrorString) - m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result)); + if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString) + SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result)); else - m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } _NBL_DEBUG_BREAK_IF(true); return false; } +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { - CUDA cuda = CUDA( + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" #elif defined(_NBL_POSIX_API_) @@ -450,7 +464,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #endif ); - NVRTC nvrtc = {}; + cuda_native::NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? 
// Version List: https://developer.nvidia.com/cuda-toolkit-archive @@ -468,7 +482,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { std::string path(*verpath); path += *suffix; - nvrtc = NVRTC(path.c_str()); + nvrtc = cuda_native::NVRTC(path.c_str()); if (nvrtc.pnvrtcVersion) break; } @@ -476,7 +490,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste break; } #elif defined(_NBL_POSIX_API_) - nvrtc = NVRTC("nvrtc"); + nvrtc = cuda_native::NVRTC("nvrtc"); //nvrtc_builtins = NVRTC("nvrtc-builtins"); #else #error "Unsuported Platform" @@ -526,10 +540,28 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); + return core::make_smart_refctd_ptr(std::make_unique(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion); +} + +namespace cuda_native +{ + +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).cuda; +} + +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).nvrtc; } -nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +const core::vector& getAvailableDevices(const CCUDAHandler& handler) +{ + return SAccess::native(handler).availableDevices; +} + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -538,26 +570,43 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source #else #error "Unsuported Platform" #endif - return 
m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); +} + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames) +{ + const auto filesize = file->getSize(); + std::string source(filesize+1u,'0'); + + system::IFile::success_t bytesRead; + file->read(bytesRead,source.data(),0u,file->getSize()); + source.resize(bytesRead.getBytesProcessed()); + + return createProgram(handler,prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); +} + +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +{ + return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log) +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return m_nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) +ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -567,7 
+616,57 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; +} + +static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) +{ + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + result = compileProgram(handler,program,nvrtcOptions); + if (log) + getProgramLog(handler,program,*log); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + return getPTX(handler,program); +} + +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount, const char* const* headerContents, const char* const* includeNames, + std::string* log) +{ + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); +} + +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount, const char* const* headerContents, const char* const* includeNames, + std::string* log) +{ + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + 
SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(handler,&program,file,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); +} + } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -578,7 +677,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_availableDevices) + for (const auto& device : m_native->availableDevices) { if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { @@ -662,7 +761,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); + return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, std::make_unique(device.handle), core::smart_refctd_ptr(this)); } } return nullptr; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index a785bad9b9..bbc65f91ab 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,30 +2,44 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +namespace cuda_native +{ + +CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory) +{ + return SAccess::native(memory).handle; +} + +CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) { CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); + bufferDesc.size = SAccess::source(memory)->getAllocationSize(); - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); } +} + CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler()); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index 1ca4a34190..b6e3b319f7 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ 
b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,17 +2,31 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +namespace cuda_native +{ + +CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore) +{ + return SAccess::native(semaphore).handle; +} + +} + CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler()); } } diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 7a69e62ad4..973fbb232a 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -5,6 +5,7 @@ if (NBL_COMPILE_WITH_CUDA) set(NBL_EXT_CUDA_INTEROP_H ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h @@ -26,12 +27,24 @@ if (NBL_COMPILE_WITH_CUDA) "${NBL_EXT_CUDA_INTEROP_SRC}" "" "" - "_NBL_COMPILE_WITH_CUDA_" + "" ) - target_link_libraries(${LIB_NAME} PUBLIC $) + target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_) + 
target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) + + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE") + add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE + $ + $ + CUDA::toolkit + ) + set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative") + add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE) endif() add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp new file mode 100644 index 0000000000..2dc3c3bbca --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp @@ -0,0 +1,106 @@ +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +#include + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState +{ + cuda_native::CUDA cuda; + cuda_native::NVRTC nvrtc; + core::vector availableDevices; + + SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) + : cuda(std::move(_cuda)) + , nvrtc(std::move(_nvrtc)) + {} +}; + +struct CCUDADevice::SNativeState +{ + CUdevice handle = {}; + CUcontext context = nullptr; + std::array allocationGranularity = {}; + + explicit SNativeState(CUdevice _handle) + : handle(_handle) + {} +}; + +struct CCUDAExportableMemory::SNativeState +{ + CUdeviceptr ptr = 0; +}; + +struct CCUDAImportedMemory::SNativeState +{ + CUexternalMemory handle = nullptr; + + explicit SNativeState(CUexternalMemory _handle) + : handle(_handle) + {} +}; + +struct CCUDAImportedSemaphore::SNativeState +{ + CUexternalSemaphore 
handle = nullptr; + + explicit SNativeState(CUexternalSemaphore _handle) + : handle(_handle) + {} +}; + +namespace cuda_native +{ + +inline CUmemLocationType toNative(ECUDAMemoryLocation location) +{ + return static_cast(static_cast(location)); +} + +inline ECUDAMemoryLocation toNabla(CUmemLocationType location) +{ + return static_cast(static_cast(location)); +} + +inline CUmemAllocationHandleType getAllocationHandleType() +{ +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + +struct SAccess +{ + static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } + static const CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; } + + static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; } + static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; } + + static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; } + static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; } + + static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } + static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } + + static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + + static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); } + static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } + static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return 
memory.m_device.get(); } + static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } +}; + +} + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md new file mode 100644 index 0000000000..1fd88d1b04 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/README.md @@ -0,0 +1,23 @@ +# CUDA Interop Targets + +- `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK. +- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. +- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. +- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. +- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. +- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. 
+ +```cmake +find_package(Nabla CONFIG REQUIRED) +target_link_libraries(app PRIVATE Nabla::Nabla) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) +target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative) +target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative) +``` diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index cd9ba7b70e..71bdac260d 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,8 +1,14 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) +option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF) + if(NOT TARGET Nabla::Nabla) - find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop) + if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative) + endif() + find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) endif() enable_testing() @@ -18,6 +24,11 @@ nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary. 
target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) if(TARGET Nabla::ext::CUDAInterop) - nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp) - target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp) + target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop) +endif() + +if(TARGET Nabla::ext::CUDAInteropNative) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) + target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp new file mode 100644 index 0000000000..6952433f9e --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -0,0 +1,42 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/IApplicationFramework.h" + +#include + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers." 
+#endif + +namespace +{ + +class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + static_assert(std::is_same_v); + + const nbl::video::CCUDAExportableMemory::SCreationParams params = { + .size = 4096, + .alignment = 4096, + .location = nbl::video::ECUDAMemoryLocation::DEVICE, + }; + return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE; + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp similarity index 72% rename from src/nbl/ext/CUDAInterop/smoke/opt_in.cpp rename to src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index bc8c8952bd..d868b2eaa7 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -1,4 +1,4 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" #include "nbl/system/IApplicationFramework.h" #include @@ -7,8 +7,8 @@ #include #include -#ifndef _NBL_COMPILE_WITH_CUDA_ -#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." +#ifndef CUDA_VERSION +#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers." 
#endif namespace @@ -25,7 +25,7 @@ using namespace nbl::video; auto cudaMemory = cudaDevice.createExportableMemory({ .size = 4096, .alignment = 4096, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .location = ECUDAMemoryLocation::DEVICE, }); if (!cudaMemory) return false; @@ -36,15 +36,16 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - importedFromVulkan->getMappedBuffer(&mappedVulkanMemory); + cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? importedSemaphore->getInternalObject():nullptr; - return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore; + const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(*importedSemaphore):nullptr; + return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = handler.getCUDAFunctionTable(); + auto& cuda = cuda_native::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -83,7 +84,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) } } -class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework +class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework { using base_t = nbl::system::IApplicationFramework; @@ -95,13 +96,13 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework if (!isAPILoaded()) return false; - static_assert(std::is_same_v().getInternalObject()), CUdevice>); + static_assert(std::is_same_v())), CUdevice>); auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) return true; - const auto& devices = handler->getAvailableDevices(); + const auto& devices = 
nbl::video::cuda_native::getAvailableDevices(*handler); if (devices.empty()) return true; @@ -112,4 +113,4 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework bool keepRunning() override { return false; } }; -NBL_MAIN_FUNC(CUDAInteropOptInSmoke) +NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke) From 49bcb2cf6c96e7fca42a16142c28ddc83686c579 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:40:53 +0200 Subject: [PATCH 09/51] Add native CUDA accessor overloads --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 172 ++++++++++++++++++ src/nbl/ext/CUDAInterop/README.md | 1 + .../ext/CUDAInterop/smoke/native_opt_in.cpp | 8 +- 4 files changed, 178 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 7a2a4f604f..dfa2b7ac39 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7a2a4f604fd941984d6624e3059f7380cc6592a2 +Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index f913664122..ea6313f26b 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -146,6 +146,26 @@ struct SCUDADeviceInfo const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) +{ + return getCUDAFunctionTable(*handler); +} + +inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr& handler) +{ + return getCUDAFunctionTable(*handler); +} + +inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler) +{ + return getNVRTCFunctionTable(*handler); +} + +inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr& handler) +{ + return getNVRTCFunctionTable(*handler); +} + bool defaultHandleResult(CUresult result, const 
system::logger_opt_ptr& logger); bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); @@ -155,12 +175,46 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } const core::vector& getAvailableDevices(const CCUDAHandler& handler); +inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) +{ + return getAvailableDevices(*handler); +} + +inline const core::vector& getAvailableDevices(const core::smart_refctd_ptr& handler) +{ + return getAvailableDevices(*handler); +} + nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return 
createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); +} nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -189,6 +243,54 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX( const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ); +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* 
includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const 
char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); +} CUdevice getInternalObject(const CCUDADevice& device); CUcontext getContext(const CCUDADevice& device); @@ -198,6 +300,76 @@ CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +inline CUdevice getInternalObject(const CCUDADevice* device) +{ + return getInternalObject(*device); +} + +inline CUdevice getInternalObject(const core::smart_refctd_ptr& device) +{ + return getInternalObject(*device); +} + +inline CUcontext getContext(const CCUDADevice* device) +{ + return getContext(*device); +} + +inline CUcontext getContext(const core::smart_refctd_ptr& device) +{ + return getContext(*device); +} + +inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size) +{ + return roundToGranularity(*device,location,size); +} + +inline size_t roundToGranularity(const core::smart_refctd_ptr& device, CUmemLocationType location, size_t size) +{ + return roundToGranularity(*device,location,size); +} + +inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) +{ + return getDeviceptr(*memory); +} + +inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr& memory) +{ + return getDeviceptr(*memory); +} + +inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory) +{ + return getInternalObject(*memory); +} + +inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr& memory) +{ + return getInternalObject(*memory); +} + +inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer) +{ + return getMappedBuffer(*memory,mappedBuffer); +} + +inline CUresult getMappedBuffer(const 
core::smart_refctd_ptr& memory, CUdeviceptr* mappedBuffer) +{ + return getMappedBuffer(*memory,mappedBuffer); +} + +inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore) +{ + return getInternalObject(*semaphore); +} + +inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr& semaphore) +{ + return getInternalObject(*semaphore); +} + } #define ASSERT_CUDA_SUCCESS(expr, handler) \ diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 1fd88d1b04..623c07ec9e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,6 +6,7 @@ - Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. - A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. +- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. ```cmake find_package(Nabla CONFIG REQUIRED) diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index d868b2eaa7..4c001ab6ce 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -36,10 +36,10 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); + cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
cuda_native::getInternalObject(*importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr; return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } @@ -102,7 +102,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!handler) return true; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); + const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); if (devices.empty()) return true; From d85657e381ecd537aa20a16ab227aa38754083d4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:48:06 +0200 Subject: [PATCH 10/51] Document CUDA interop target split --- src/nbl/ext/CUDAInterop/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 623c07ec9e..a73b9d9c21 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,5 +1,8 @@ # CUDA Interop Targets +This extension keeps CUDA interop available without making CUDA a default public +compile-time dependency of Nabla. + - `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK. - `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. - `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. @@ -8,6 +11,18 @@ - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. 
- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. +## Design + +- The default Nabla package remains relocatable and usable on machines without the CUDA SDK. +- CUDA is used privately to build the interop library. CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested. +- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work. +- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers. +- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores. +- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library. + +## Usage + ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) From 6e8c4f99399b3111c2800a8ffd5f36cd9b17c418 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:58:35 +0200 Subject: [PATCH 11/51] Trim CUDA interop README wording --- src/nbl/ext/CUDAInterop/README.md | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a73b9d9c21..104f7f2eca 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,25 +1,22 @@ # CUDA Interop Targets -This extension keeps CUDA interop available without making CUDA a default public -compile-time dependency of Nabla. - -- `Nabla::Nabla` stays CUDA-free. 
`find_package(Nabla CONFIG)` does not require the CUDA SDK. -- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. -- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. -- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. -- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. -- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. -- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. +- `Nabla::Nabla` does not require the CUDA SDK. +- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`. +- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers. +- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInteropNative`. +- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`. +- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. +- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. ## Design -- The default Nabla package remains relocatable and usable on machines without the CUDA SDK. -- CUDA is used privately to build the interop library. 
CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested. -- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. -- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work. -- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers. -- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores. -- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library. +- CUDA is used privately while building the interop library. +- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`. +- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. +- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. +- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. 
## Usage From 881e9b83c19388647336d56ef438f07b66781641 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 17:43:35 +0200 Subject: [PATCH 12/51] Move CUDA interop into Nabla --- cmake/NablaConfig.cmake.in | 17 +-- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 2 +- .../ext/CUDAInterop/CCUDAExportableMemory.h | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 2 +- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 2 +- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 40 +++---- src/nbl/CMakeLists.txt | 46 ++++---- src/nbl/ext/CMakeLists.txt | 8 -- src/nbl/ext/CUDAInterop/CMakeLists.txt | 51 ++------- src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp | 100 ++++++++++++++++++ src/nbl/ext/CUDAInterop/README.md | 28 +++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 14 ++- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 4 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 2 +- 16 files changed, 183 insertions(+), 139 deletions(-) create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index afff3dcccc..8b9f62e548 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -7,7 +7,6 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) -set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -31,12 +30,6 @@ if(_NBL_NABLA_COMPONENTS) set(_NBL_NABLA_LOAD_CORE ON) set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) set(Nabla_CUDAInterop_FOUND TRUE) - elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative") - set(_NBL_NABLA_LOAD_CORE ON) - set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) - set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON) - set(Nabla_CUDAInterop_FOUND TRUE) - 
set(Nabla_CUDAInteropNative_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -93,10 +86,6 @@ if(_NBL_NABLA_LOAD_NSC) endif() if(_NBL_NABLA_LOAD_CUDA_INTEROP) - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) -endif() - -if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) include(CMakeFindDependencyMacro) if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") @@ -104,9 +93,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) endif() find_dependency(CUDAToolkit 13.0 REQUIRED) - _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative) - target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() diff --git a/examples_tests b/examples_tests index dfa2b7ac39..3b59c9bc05 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e +Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74 diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 25c40e7ed6..7b994e053f 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -22,7 +22,7 @@ namespace cuda_native struct SAccess; } -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 5973c31fac..b331d6a258 100644 --- 
a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -26,7 +26,7 @@ enum class ECUDAMemoryLocation : uint32_t HOST_NUMA_CURRENT = 4 }; -class CCUDAExportableMemory : public core::IReferenceCounted +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 063598a518..6a3cc6c496 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -25,7 +25,7 @@ namespace cuda_native struct SAccess; } -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 8a24f83907..adb803f12c 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -16,7 +16,7 @@ namespace cuda_native struct SAccess; } -class CCUDAImportedMemory : public core::IReferenceCounted +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 3ee03fb045..894f2444c0 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -19,7 +19,7 @@ namespace cuda_native struct SAccess; } -class CCUDAImportedSemaphore : public core::IReferenceCounted +class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index ea6313f26b..b73f2ae252 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ 
b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -143,8 +143,8 @@ struct SCUDADeviceInfo int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) { @@ -166,14 +166,14 @@ inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } -const core::vector& getAvailableDevices(const CCUDAHandler& handler); +NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) { @@ -185,12 +185,12 @@ inline const core::vector& getAvailableDevices(const core::smar return getAvailableDevices(*handler); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 
nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); @@ -215,8 +215,8 @@ inline nvrtcResult createProgram(const core::smart_refctd_ptr& han { return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); } -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); struct ptx_and_nvrtcResult_t { @@ -224,8 +224,8 @@ struct ptx_and_nvrtcResult_t nvrtcResult result; }; -ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -ptx_and_nvrtcResult_t compileDirectlyToPTX( +NBL_API2 ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr @@ -238,7 +238,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( { return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); } -ptx_and_nvrtcResult_t compileDirectlyToPTX( +NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( 
CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr @@ -292,13 +292,13 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); } -CUdevice getInternalObject(const CCUDADevice& device); -CUcontext getContext(const CCUDADevice& device); -size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); -CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); -CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); +NBL_API2 CUcontext getContext(const CCUDADevice& device); +NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); +NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); inline CUdevice getInternalObject(const CCUDADevice* device) { diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ecf7f555c3..f0f7b275c0 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -124,6 +124,20 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + +set(NBL_CUDA_INTEROP_SOURCES + ext/CUDAInterop/CUDAInteropStubs.cpp +) +if(NBL_COMPILE_WITH_CUDA) + set(NBL_CUDA_INTEROP_SOURCES + ext/CUDAInterop/CCUDADevice.cpp + ext/CUDAInterop/CCUDAExportableMemory.cpp + 
ext/CUDAInterop/CCUDAHandler.cpp + ext/CUDAInterop/CCUDAImportedMemory.cpp + ext/CUDAInterop/CCUDAImportedSemaphore.cpp + ) +endif() + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -306,6 +320,7 @@ set(NABLA_SRCS_COMMON ${NBL_VIDEO_SOURCES} ${NBL_SCENE_SOURCES} ${NBL_META_SOURCES} + ${NBL_CUDA_INTEROP_SOURCES} ) if(MSVC) @@ -416,6 +431,11 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_) + target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) @@ -783,35 +803,17 @@ add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) - set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) - - set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) - list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets) - endif() - nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS}) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB} + EXPORT NablaCUDAInteropExportTargets + COMPONENT Libraries + ) install(EXPORT NablaCUDAInteropExportTargets NAMESPACE Nabla:: DESTINATION cmake COMPONENT Libraries ) endif() - - if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) - install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} - EXPORT NablaCUDAInteropNativeExportTargets - COMPONENT Libraries - ) - install(EXPORT NablaCUDAInteropNativeExportTargets - NAMESPACE Nabla:: - DESTINATION cmake - COMPONENT Libraries - ) - 
endif() - endif() endif() if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index 59ae49285e..264cfc7c2d 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -40,18 +40,10 @@ endif() add_subdirectory(CUDAInterop) if (NBL_COMPILE_WITH_CUDA) - set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS - ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS} - PARENT_SCOPE - ) set(NBL_EXT_CUDA_INTEROP_LIB ${NBL_EXT_CUDA_INTEROP_LIB} PARENT_SCOPE ) - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB - ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} - PARENT_SCOPE - ) endif() if (NBL_BUILD_IMGUI) diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 973fbb232a..438ab51d8f 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -1,50 +1,17 @@ include(${NBL_ROOT_PATH}/cmake/common.cmake) if (NBL_COMPILE_WITH_CUDA) - set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop") + set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") - set(NBL_EXT_CUDA_INTEROP_H - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE + $ + $ + $ ) - - set(NBL_EXT_CUDA_INTEROP_SRC - CCUDADevice.cpp - CCUDAExportableMemory.cpp - CCUDAHandler.cpp - CCUDAImportedMemory.cpp - CCUDAImportedSemaphore.cpp - ) - - nbl_create_ext_library_project( - CUDA_INTEROP - "${NBL_EXT_CUDA_INTEROP_H}" - "${NBL_EXT_CUDA_INTEROP_SRC}" - "" - "" - "" - ) - - target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_) - 
target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") - add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) - - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE") - add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE) - target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE - $ - $ - CUDA::toolkit - ) - set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative") - add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") + add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) + set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE) endif() add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp new file mode 100644 index 0000000000..db2b068391 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp @@ -0,0 +1,100 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState {}; +struct CCUDADevice::SNativeState {}; +struct CCUDAExportableMemory::SNativeState {}; +struct CCUDAImportedMemory::SNativeState {}; +struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{} + +CCUDAHandler::~CCUDAHandler() = default; + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr 
CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_physicalDevice(vulkanDevice) + , m_virtualArchitecture(virtualArchitecture) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{} + +CCUDADevice::~CCUDADevice() = default; + +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const +{ + return size; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , 
m_native(std::move(nativeState)) +{} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +} diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 104f7f2eca..6eee617714 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,21 +1,22 @@ # CUDA Interop Targets - `Nabla::Nabla` does not require the CUDA SDK. -- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`. -- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`. -- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers. -- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInteropNative`. -- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`. +- `Nabla::Nabla` provides Nabla CUDA interop types when the package was built with CUDA support. +- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target. +- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. +- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. +- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla. - Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. ## Design -- CUDA is used privately while building the interop library. -- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`. -- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. 
-- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. -- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. +- CUDA is used privately while building `Nabla::Nabla`. +- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`. +- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. +- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. ## Usage @@ -27,10 +28,5 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) -target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop) -``` - -```cmake -find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative) -target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative) +target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 71bdac260d..bdda95fb03 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -4,9 +4,9 @@ project(NblExtCUDAInteropSmoke CXX) option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." 
OFF) if(NOT TARGET Nabla::Nabla) - set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) - list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop) endif() find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) endif() @@ -23,12 +23,10 @@ endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) -if(TARGET Nabla::ext::CUDAInterop) - nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop) -endif() +nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp) +target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) -if(TARGET Nabla::ext::CUDAInteropNative) +if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative) + target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index 6952433f9e..348caa766e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -4,11 +4,11 @@ #include #ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define." +#error "Nabla::Nabla must not propagate the CUDA build define." #endif #ifdef CUDA_VERSION -#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers." +#error "Nabla::Nabla must not require CUDA SDK headers." 
#endif namespace diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 4c001ab6ce..a78f710040 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -8,7 +8,7 @@ #include #ifndef CUDA_VERSION -#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers." +#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers." #endif namespace From 5dd1134ffc7d144e24f0ee3a55a283025b01fed8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 06:15:03 +0200 Subject: [PATCH 13/51] Document CUDA interop accessor model --- src/nbl/ext/CUDAInterop/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 6eee617714..a7c1e654be 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -19,6 +19,14 @@ - The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. +## OpenCV Reference + +- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52). +- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979). 
+- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). +- OpenCV also keeps implementation CUDA headers private and includes `` / `` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). +- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`. + ## Usage ```cmake From e514df7f505bbc168f13ccc50750a62c2e6680bf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 06:41:06 +0200 Subject: [PATCH 14/51] Inline CUDA interop stubs --- src/nbl/CMakeLists.txt | 15 +-- src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 50 ++++++++- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 27 ++++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 38 ++++++- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 21 +++- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 22 +++- src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp | 100 ------------------ src/nbl/ext/CUDAInterop/README.md | 12 +++ 8 files changed, 169 insertions(+), 116 deletions(-) delete mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index f0f7b275c0..ccb600ca32 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -126,17 +126,12 @@ set(NBL_CORE_SOURCES ) set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CUDAInteropStubs.cpp + ext/CUDAInterop/CCUDADevice.cpp + ext/CUDAInterop/CCUDAExportableMemory.cpp + ext/CUDAInterop/CCUDAHandler.cpp + ext/CUDAInterop/CCUDAImportedMemory.cpp + ext/CUDAInterop/CCUDAImportedSemaphore.cpp ) 
-if(NBL_COMPILE_WITH_CUDA) - set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CCUDADevice.cpp - ext/CUDAInterop/CCUDAExportableMemory.cpp - ext/CUDAInterop/CCUDAHandler.cpp - ext/CUDAInterop/CCUDAImportedMemory.cpp - ext/CUDAInterop/CCUDAImportedSemaphore.cpp - ) -endif() set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 5f59545173..7d002c86ca 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,13 +1,15 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -241,4 +243,50 @@ CCUDADevice::~CCUDADevice() } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDADevice::SNativeState {}; + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_physicalDevice(vulkanDevice) + , m_virtualArchitecture(virtualArchitecture) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{} + +CCUDADevice::~CCUDADevice() = default; + +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const +{ + return size; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 94d18c40bb..a89e42b2f6 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,9 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { @@ -66,4 +68,27 @@ CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) } } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAExportableMemory::SNativeState {}; + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 49e36083d4..51f0656f6c 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,10 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" -#include "nbl/system/CFileView.h" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" +#include "nbl/system/CFileView.h" #include "jitify/jitify.hpp" @@ -769,4 +770,37 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAHandler::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{} + +CCUDAHandler::~CCUDAHandler() = default; + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index bbc65f91ab..8de3ce3e63 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,9 +2,10 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" namespace nbl::video { @@ -44,4 +45,22 @@ CCUDAImportedMemory::~CCUDAImportedMemory() } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedMemory::SNativeState {}; + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +} + #endif diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index b6e3b319f7..fdbb56b0cf 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,9 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) @@ -30,4 +32,22 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore() } } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp deleted file mode 100644 index db2b068391..0000000000 --- a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" - -namespace nbl::video -{ - -struct CCUDAHandler::SNativeState {}; -struct CCUDADevice::SNativeState {}; -struct CCUDAExportableMemory::SNativeState {}; -struct CCUDAImportedMemory::SNativeState {}; -struct CCUDAImportedSemaphore::SNativeState {}; - -CCUDAHandler::CCUDAHandler( - std::unique_ptr&& nativeState, - core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) - : m_native(std::move(nativeState)) - , m_headers(std::move(_headers)) - , m_logger(std::move(_logger)) - , m_version(_version) -{} - -CCUDAHandler::~CCUDAHandler() = default; - -core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) -{ - return nullptr; -} - -CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& vulkanConnection, - IPhysicalDevice* const vulkanDevice, - const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - std::unique_ptr&& nativeState, - core::smart_refctd_ptr&& handler) - : m_logger(nullptr) - , m_vulkanConnection(std::move(vulkanConnection)) - , m_physicalDevice(vulkanDevice) - , m_virtualArchitecture(virtualArchitecture) - , m_handler(std::move(handler)) - , m_native(std::move(nativeState)) -{} - -CCUDADevice::~CCUDADevice() = default; - 
-size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const -{ - return size; -} - -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) -{ - return nullptr; -} - -CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_params(std::move(params)) - , m_native(std::move(nativeState)) -{} - -CCUDAExportableMemory::~CCUDAExportableMemory() = default; - -core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const -{ - return nullptr; -} - -CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_native(std::move(nativeState)) -{} - -CCUDAImportedMemory::~CCUDAImportedMemory() = default; - -CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_native(std::move(nativeState)) -{} - -CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; - -} diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a7c1e654be..407b5e81b3 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -38,3 +38,15 @@ target_link_libraries(app PRIVATE Nabla::Nabla) find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` + +## Properties + +- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a 
public compile-time requirement. +- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers. +- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly. +- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path. +- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. +- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. +- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested. +- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla. 
From e53c838207aaf5f15513e6e622038d852154cfbb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 09:25:29 +0200 Subject: [PATCH 15/51] Refine CUDA interop boundary --- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 4 -- .../ext/CUDAInterop/CCUDAExportableMemory.h | 18 +----- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 21 +++++++ src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 60 +++++++++--------- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 12 ++-- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 52 ++++++++++++++- .../CUDAInterop/CUDAInteropNativeState.hpp | 10 --- src/nbl/ext/CUDAInterop/README.md | 63 ++++++++++--------- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 13 ++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 4 +- 11 files changed, 147 insertions(+), 112 deletions(-) diff --git a/examples_tests b/examples_tests index 3b59c9bc05..fbb82d36e0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74 +Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 7b994e053f..12465f40f4 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -81,10 +81,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const; - - core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); - core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h 
b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index b331d6a258..80a9b3630a 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -18,36 +18,22 @@ namespace cuda_native struct SAccess; } -enum class ECUDAMemoryLocation : uint32_t -{ - DEVICE = 1, - HOST = 2, - HOST_NUMA = 3, - HOST_NUMA_CURRENT = 4 -}; - class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: struct SNativeState; - struct SCreationParams + struct SCachedCreationParams { size_t size; uint32_t alignment; - ECUDAMemoryLocation location; - }; - - struct SCachedCreationParams : SCreationParams - { size_t granularSize; external_handle_t externalHandle; + bool deviceLocal; }; CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); ~CCUDAExportableMemory() override; - const SCreationParams& getCreationParams() const { return m_params; } - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index b73f2ae252..dd87d93e43 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -18,6 +18,9 @@ namespace nbl::video::cuda_native { +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; + using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader @@ -143,6 +146,13 @@ struct SCUDADeviceInfo int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; +struct SExportableMemoryCreationParams +{ + size_t size; + uint32_t alignment; + CUmemLocationType location; +}; + NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& 
handler); @@ -295,6 +305,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); NBL_API2 CUcontext getContext(const CCUDADevice& device); NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +NBL_API2 core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); @@ -330,6 +341,16 @@ inline size_t roundToGranularity(const core::smart_refctd_ptr& devi return roundToGranularity(*device,location,size); } +inline core::smart_refctd_ptr createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params) +{ + return createExportableMemory(*device,std::move(params)); +} + +inline core::smart_refctd_ptr createExportableMemory(const core::smart_refctd_ptr& device, SExportableMemoryCreationParams&& params) +{ + return createExportableMemory(*device,std::move(params)); +} + inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) { return getDeviceptr(*memory); diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 7d002c86ca..ebac00b7b4 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -58,11 +58,6 @@ CCUDADevice::CCUDADevice( } } -size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const -{ - return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size); -} - namespace cuda_native { @@ -84,6 +79,11 @@ size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, } +static bool isDeviceLocal(CUmemLocationType location) +{ + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + 
static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { const auto handler = device.getHandler(); @@ -117,12 +117,23 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) +namespace cuda_native { - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - const auto nativeLocation = cuda_native::toNative(params.location); +core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +{ + const auto handler = device.getHandler(); + auto& native = SAccess::native(device); + auto logger = SAccess::logger(device); + + CCUDAExportableMemory::SCachedCreationParams params = { + .size = inParams.size, + .alignment = inParams.alignment, + .granularSize = roundToGranularity(device, inParams.location, inParams.size), + .deviceLocal = isDeviceLocal(inParams.location) + }; + + auto& cu = getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -132,35 +143,34 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::getAllocationHandleType(), - .location = { .type = nativeLocation, .id = m_native->handle }, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = inParams.location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - params.granularSize = roundToGranularity(params.location, params.size); auto nativeState = std::make_unique(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) { - 
m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) { - m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); bool closeSucceed = CloseExternalHandle(params.externalHandle); assert(closeSucceed); @@ -175,7 +185,9 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), std::move(nativeState)); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(&device), std::move(params), std::move(nativeState)); +} + } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) @@ -267,16 +279,6 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; -size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const -{ - return size; -} - -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) -{ - return nullptr; -} 
- core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index a89e42b2f6..a65d1b680c 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -22,14 +22,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); - switch (m_params.location) - { - case ECUDAMemoryLocation::DEVICE: memoryTypeBits &= vram; break; - case ECUDAMemoryLocation::HOST_NUMA: - case ECUDAMemoryLocation::HOST_NUMA_CURRENT: - case ECUDAMemoryLocation::HOST: memoryTypeBits &= ~vram; break; - default: break; - } + if (m_params.deviceLocal) + memoryTypeBits &= vram; + else + memoryTypeBits &= ~vram; IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; req.size = m_params.granularSize; diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 51f0656f6c..777a1db14a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -12,6 +12,21 @@ namespace nbl::video { + +namespace +{ + +int cudaVersionMajor(int version) +{ + return version/1000; +} + +int cudaVersionMinor(int version) +{ + return (version%1000)/10; +} + +} CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, @@ -455,6 +470,8 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { + const system::logger_opt_ptr logger(_logger.get()); + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" @@ -502,18 +519,32 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #define SAFE_CUDA_CALL(FUNC,...) 
\ {\ if (!cuda.p ## FUNC)\ + {\ + logger.log("CCUDAHandler: CUDA Driver API function %s was not found. Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\ return nullptr;\ + }\ auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ + {\ + logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast(result));\ return nullptr;\ + }\ } SAFE_CUDA_CALL(cuInit,0) int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<13000) + if (cudaVersion CCUDAHandler::create(system::ISystem* syste // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) + { + logger.log("CCUDAHandler: NVRTC runtime was not found. Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; + } int nvrtcVersion[2] = { -1,-1 }; - nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersion[0]<9) + const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); + if (nvrtcVersionResult!=NVRTC_SUCCESS) + { + logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast(nvrtcVersionResult)); return nullptr; + } + if (nvrtcVersion[0]> headers; diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp index 2dc3c3bbca..47701359ba 100644 --- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp +++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp @@ -57,16 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState namespace cuda_native { -inline CUmemLocationType toNative(ECUDAMemoryLocation location) -{ - return static_cast(static_cast(location)); -} - -inline ECUDAMemoryLocation toNabla(CUmemLocationType location) -{ - return static_cast(static_cast(location)); -} - inline 
CUmemAllocationHandleType getAllocationHandleType() { #ifdef _WIN32 diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 407b5e81b3..cf3a89cdd1 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,32 +1,12 @@ # CUDA Interop Targets -- `Nabla::Nabla` does not require the CUDA SDK. -- `Nabla::Nabla` provides Nabla CUDA interop types when the package was built with CUDA support. -- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`. -- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target. +- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. +- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. -- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla. -- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. -## Design - -- CUDA is used privately while building `Nabla::Nabla`. -- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`. -- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. -- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. -- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. 
-- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. - -## OpenCV Reference - -- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52). -- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979). -- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). -- OpenCV also keeps implementation CUDA headers private and includes `` / `` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). -- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`. 
- ## Usage ```cmake @@ -35,18 +15,39 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ``` ```cmake -find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` +```cpp +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { + .size = size, + .alignment = alignment, + .location = CU_MEM_LOCATION_TYPE_DEVICE, +}); +``` + ## Properties -- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a public compile-time requirement. -- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers. -- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly. -- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path. +- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. +- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`. +- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. +- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. +- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. +- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. +- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. 
-- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. -- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested. -- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla. + +## Related Designs + +- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). 
+- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48). +- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries). +- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072). +- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html). 
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index 348caa766e..e36fe65701 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -23,14 +23,11 @@ class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramewo bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_assert(std::is_same_v); - - const nbl::video::CCUDAExportableMemory::SCreationParams params = { - .size = 4096, - .alignment = 4096, - .location = nbl::video::ECUDAMemoryLocation::DEVICE, - }; - return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE; + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + return isAPILoaded(); } void workLoopBody() override {} diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index a78f710040..6dda3d275e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -22,10 +22,10 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cudaDevice.createExportableMemory({ + auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, - .location = ECUDAMemoryLocation::DEVICE, + .location = CU_MEM_LOCATION_TYPE_DEVICE, }); if (!cudaMemory) return false; From 141790523f61caa5fbbf45223ba4cfa0bade78c9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 11:58:20 +0200 Subject: [PATCH 16/51] Add CUDA interop runtime header discovery --- CMakeLists.txt | 1 + cmake/NablaCUDAInteropHelpers.cmake | 182 ++++++++++ cmake/NablaConfig.cmake.in | 3 + examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 16 + src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 331 
+++++++++++++++++- src/nbl/ext/CUDAInterop/CMakeLists.txt | 19 +- src/nbl/ext/CUDAInterop/README.md | 48 ++- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 8 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 42 +++ 10 files changed, 641 insertions(+), 11 deletions(-) create mode 100644 cmake/NablaCUDAInteropHelpers.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 14845789fc..9251a3ee68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -312,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL) set(_NBL_NABLA_CONFIG_FILES "${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake" + "${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake" ) install(EXPORT NablaExportTargets diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake new file mode 100644 index 0000000000..6486789aeb --- /dev/null +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -0,0 +1,182 @@ +function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS) + set(_include_dirs ${ARGN}) + + if(DEFINED CUDAToolkit_INCLUDE_DIRS AND NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "") + list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS}) + endif() + + if(TARGET CUDA::toolkit) + get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES) + if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND") + list(APPEND _include_dirs ${_cuda_toolkit_include_dirs}) + endif() + endif() + + if(_include_dirs) + list(REMOVE_DUPLICATES _include_dirs) + endif() + + set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) + set(_include_dirs ${ARGN}) + set(_json "{\n \"cudaRuntimeIncludeDirs\": [") + set(_first ON) + + foreach(_include_dir IN LISTS _include_dirs) + if("${_include_dir}" STREQUAL "") + continue() + endif() + + file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) + string(REPLACE "\"" "\\\"" 
_include_dir_json "${_include_dir_json}") + + if(_first) + string(APPEND _json "\n") + set(_first OFF) + else() + string(APPEND _json ",\n") + endif() + string(APPEND _json " \"${_include_dir_json}\"") + endforeach() + + if(NOT _first) + string(APPEND _json "\n ]\n}\n") + else() + string(APPEND _json "]\n}\n") + endif() + + set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS) + if(CMAKE_CONFIGURATION_TYPES) + set(_configs ${CMAKE_CONFIGURATION_TYPES}) + elseif(CMAKE_BUILD_TYPE) + set(_configs "${CMAKE_BUILD_TYPE}") + else() + set(_configs Debug) + endif() + + list(REMOVE_DUPLICATES _configs) + set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT) + _nbl_cuda_interop_collect_configs(_configs) + set(_runtime_jsons "") + + if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "") + foreach(_config IN LISTS _configs) + set(_runtime_paths_json "${_OVERRIDE_OUTPUT}") + string(REPLACE "$" "${_config}" _runtime_paths_json "${_runtime_paths_json}") + if(_runtime_paths_json MATCHES "\\$<") + message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $.") + endif() + cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs) + if(NOT _is_abs) + cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json) + endif() + cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json) + list(APPEND _runtime_jsons "${_runtime_paths_json}") + endforeach() + list(REMOVE_DUPLICATES _runtime_jsons) + set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) + return() + endif() + + foreach(_config IN LISTS _configs) + string(TOUPPER "${_config}" _config_upper) + get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}") + + if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") + 
get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY) + endif() + if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}) + set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}") + endif() + if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + endif() + if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") + if(CMAKE_CONFIGURATION_TYPES) + set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}") + else() + set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}") + endif() + endif() + + string(REPLACE "$" "${_config}" _runtime_output_dir "${_runtime_output_dir}") + if(_runtime_output_dir MATCHES "\\$<") + message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $.") + endif() + + cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs) + if(NOT _is_abs) + cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir) + endif() + cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir) + + list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json") + endforeach() + + list(REMOVE_DUPLICATES _runtime_jsons) + set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) +endfunction() + +function(nbl_configure_cuda_interop_runtime TARGET_NAME) + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + + if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") + endif() + + if(NOT TARGET "${TARGET_NAME}") + message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") + endif() + + 
_nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}) + + _nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs}) + _nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") + + foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons) + file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}") + endforeach() + + set_source_files_properties(${_runtime_paths_jsons} PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) + target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons}) +endfunction() + +function(nbl_target_link_cuda_interop TARGET_NAME) + set(_args ${ARGN}) + set(_scope PRIVATE) + + if(_args) + list(GET _args 0 _first_arg) + if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(_scope "${_first_arg}") + list(REMOVE_AT _args 0) + endif() + endif() + + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args}) + + if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") + endif() + + if(NOT TARGET "${TARGET_NAME}") + message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") + endif() + if(NOT TARGET Nabla::ext::CUDAInterop) + message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. 
Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.") + endif() + + target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop) + nbl_configure_cuda_interop_runtime("${TARGET_NAME}" + RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}" + INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS} + ) +endfunction() diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index 8b9f62e548..0464340ce3 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -96,6 +96,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + endif() endif() endif() diff --git a/examples_tests b/examples_tests index fbb82d36e0..b2c639c8b7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca +Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256 diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 6a3cc6c496..bed4f9a31c 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -8,6 +8,7 @@ #include "nbl/core/definitions.h" #include "nbl/system/declarations.h" +#include "nbl/system/path.h" #include #include @@ -25,6 +26,21 @@ namespace cuda_native struct SAccess; } +namespace cuda_interop +{ +inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; + +struct SRuntimeCompileEnvironment +{ + core::vector includeDirs; + core::vector runtimePathFiles; +}; + +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); +NBL_API2 
SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles); +NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment); +} + class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 777a1db14a..fce7fd2b5a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -3,6 +3,324 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/ModuleLookupUtils.h" + +#include +#include +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ +namespace +{ + +std::string readEnvironmentVariable(std::string_view name) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + char* value = nullptr; + size_t size = 0; + if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value) + return {}; + std::string result(value); + std::free(value); + return result; + #else + if (const char* value = std::getenv(std::string(name).c_str())) + return value; + return {}; + #endif +} + +bool isDirectory(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error); +} + +bool isRegularFile(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error); +} + +system::path normalizedAbsolute(system::path path) +{ + std::error_code error; + auto absolute = std::filesystem::absolute(path,error); + if (error) + absolute = std::move(path); + return absolute.lexically_normal(); +} + +bool looksLikeCUDAIncludeDir(const system::path& path) +{ + if (!isDirectory(path)) + return false; + + return isRegularFile(path/"cuda_fp16.h") || + isRegularFile(path/"cuda_runtime_api.h") || + 
isRegularFile(path/"vector_types.h") || + isRegularFile(path/"cuda.h") || + isRegularFile(path/"nv"/"target"); +} + +void appendIncludeDir(core::vector& includeDirs, system::path path) +{ + if (path.empty() || !looksLikeCUDAIncludeDir(path)) + return; + + path = normalizedAbsolute(std::move(path)); + const auto pathString = path.generic_string(); + const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) { + return existing.generic_string()==pathString; + }); + if (alreadyAdded==includeDirs.end()) + includeDirs.push_back(std::move(path)); +} + +void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendIncludeDir(includeDirs,root); + appendIncludeDir(includeDirs,root/"include"); +} + +core::vector parseStringArray(std::string_view text, std::string_view key) +{ + core::vector values; + const std::string quotedKey = "\"" + std::string(key) + "\""; + const auto keyPos = text.find(quotedKey); + if (keyPos==std::string_view::npos) + return values; + + const auto arrayBegin = text.find('[',keyPos+quotedKey.size()); + if (arrayBegin==std::string_view::npos) + return values; + const auto arrayEnd = text.find(']',arrayBegin+1); + if (arrayEnd==std::string_view::npos) + return values; + + for (auto pos = arrayBegin+1; pos=arrayEnd) + break; + + std::string value; + auto cursor = quoteBegin+1; + for (; cursor& includeDirs, const system::path& configFile) +{ + if (!isRegularFile(configFile)) + return; + + std::ifstream input(configFile); + if (!input) + return; + + std::stringstream buffer; + buffer << input.rdbuf(); + for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs")) + appendIncludeDir(includeDirs,system::path(path)); +} + +void appendRuntimePathsConfigEnv(core::vector& includeDirs, std::string_view name) +{ + const auto value = readEnvironmentVariable(name); + if (value.empty()) + return; + + #if defined(_NBL_PLATFORM_WINDOWS_) 
+ constexpr char Separator = ';'; + #else + constexpr char Separator = ':'; + #endif + + size_t begin = 0; + while (begin& includeDirs, const core::vector& explicitRuntimePathFiles) +{ + for (const auto& runtimePathFile : explicitRuntimePathFiles) + appendRuntimePathsConfig(includeDirs,runtimePathFile); + + appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + + const auto exeDir = system::executableDirectory(); + if (!exeDir.empty()) + appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); + + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto releaseModuleDir = system::loadedModuleDirectory("Nabla.dll"); + if (!releaseModuleDir.empty()) + appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName); + const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll"); + if (!debugModuleDir.empty()) + appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName); + #endif +} + +void appendAppLocalIncludeDirs(core::vector& includeDirs) +{ + const auto exeDir = system::executableDirectory(); + if (exeDir.empty()) + return; + + appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); + appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); + appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); +} + +void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,root/"Library"/"include"); + appendIncludeDir(includeDirs,root/"include"); +} + +void appendPathListEnv(core::vector& includeDirs, std::string_view name) +{ + const auto value = readEnvironmentVariable(name); + if 
(value.empty()) + return; + + #if defined(_NBL_PLATFORM_WINDOWS_) + constexpr char Separator = ';'; + #else + constexpr char Separator = ':'; + #endif + + size_t begin = 0; + while (begin& includeDirs) +{ + appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT")); + + appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV")); + appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX")); +} + +void appendCUDAInstallRoots(core::vector& includeDirs, const system::path& root) +{ + if (!isDirectory(root)) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + if (!entry.is_directory(error)) + continue; + candidates.push_back(entry.path()/"include"); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendIncludeDir(includeDirs,candidate); +} + +void appendSystemIncludeDirs(core::vector& includeDirs) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"); + #else + appendIncludeDir(includeDirs,"/usr/local/cuda/include"); + appendCUDAInstallRoots(includeDirs,"/usr/local"); + appendIncludeDir(includeDirs,"/usr/include"); + #endif +} + +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) +{ + SRuntimeCompileEnvironment 
environment; + environment.runtimePathFiles = std::move(runtimePathFiles); + for (auto& includeDir : explicitIncludeDirs) + appendIncludeDir(environment.includeDirs,std::move(includeDir)); + + appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles); + appendAppLocalIncludeDirs(environment.includeDirs); + appendEnvironmentIncludeDirs(environment.includeDirs); + appendSystemIncludeDirs(environment.includeDirs); + + return environment; +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs) +{ + return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{}); +} + +core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} + +} #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" @@ -671,7 +989,18 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv if (result!=NVRTC_SUCCESS) return {nullptr,result}; - result = compileProgram(handler,program,nvrtcOptions); + const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); + const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + core::vector options; + options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); + for (const auto option : nvrtcOptions) + options.push_back(option); + for (const auto& option : runtimeIncludeOptions) + options.push_back(option.c_str()); + + const auto* optionsBegin = options.empty() ? nullptr:options.data(); + const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); + result = compileProgram(handler,program,{optionsBegin,optionsEnd}); if (log) getProgramLog(handler,program,*log); if (result!=NVRTC_SUCCESS) diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 438ab51d8f..a9e1663fa9 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -1,13 +1,22 @@ -include(${NBL_ROOT_PATH}/cmake/common.cmake) +include(common) +include(NablaCUDAInteropHelpers) if (NBL_COMPILE_WITH_CUDA) set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") - add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE) + file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h") + set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES + ${NBL_EXT_CUDA_INTEROP_IDE_HEADERS} + CMakeLists.txt + README.md + ) + set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE) + + # Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers. + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES}) target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE - $ - $ - $ + Nabla + CUDA::toolkit ) set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index cf3a89cdd1..837f3ab28e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -2,8 +2,12 @@ - `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. - `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. +- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds. - `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. 
+- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact. +- The target only carries usage requirements and IDE-visible sources. - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. +- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`. - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. @@ -16,12 +20,28 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) -target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE + INCLUDE_DIRS "${cuda_runtime_headers}" +) +``` + +```cmake +nbl_target_link_cuda_interop(native_app PRIVATE + RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$/my_cuda_runtime.json" +) ``` ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); +auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv); + auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { .size = size, .alignment = alignment, @@ -29,6 +49,23 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { }); ``` +## Runtime Header Discovery + +- `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. +- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`. +- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation. 
+- `RUNTIME_JSON ` overrides the generated JSON location. Plain paths and `$` are supported. +- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. +- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list. +- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. +- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. +- `CUDA_PATH` is a developer fallback. It is not required for packaged applications. +- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. + ## Properties - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. @@ -38,12 +75,17 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. 
It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. +- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. +- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla. +- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers. - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. -- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. -- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. ## Related Designs +This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API. + - OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). 
- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index bdda95fb03..7118eeff09 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF) +set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.") if(NOT TARGET Nabla::Nabla) set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) @@ -28,5 +29,10 @@ target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop) + set(_nbl_cuda_interop_smoke_args PRIVATE) + if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON) + list(APPEND _nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + endif() + nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args}) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp 
b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 6dda3d275e..3b799a56cf 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,30 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) releaseContext(); return ok && std::ranges::equal(input, output); } + +bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + #include + extern "C" __global__ void fp16_probe(unsigned short* out) + { + out[0] = sizeof(__half); + } + )cuda"; + + std::string log; + auto [ptx, result] = cuda_native::compileDirectlyToPTX( + handler, + Source, + "cuda_fp16_discovery_probe.cu", + {nullptr,nullptr}, + 0, + nullptr, + nullptr, + &log + ); + return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u; +} } class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework @@ -98,10 +123,27 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew static_assert(std::is_same_v())), CUdevice>); + #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); + if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) + return false; + #else + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); + #endif + const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) { + return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h"); + })!=runtimeEnvironment.includeDirs.end(); + if (includeOptions.empty() || !hasRuntimeHeaders) + return false; + auto handler 
= nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) return true; + if (!cudaFp16HeaderCompileProbe(*handler)) + return false; + const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); if (devices.empty()) return true; From 045432e616810403aa55d1232cd57fbbcc6dc8d1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 12:47:01 +0200 Subject: [PATCH 17/51] Tighten CUDA interop native helpers --- cmake/NablaCUDAInteropHelpers.cmake | 30 +-- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 255 ++++++++---------- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 81 ++---- src/nbl/ext/CUDAInterop/README.md | 27 ++ 4 files changed, 178 insertions(+), 215 deletions(-) diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake index 6486789aeb..9c1ac657d4 100644 --- a/cmake/NablaCUDAInteropHelpers.cmake +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -21,8 +21,7 @@ endfunction() function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) set(_include_dirs ${ARGN}) - set(_json "{\n \"cudaRuntimeIncludeDirs\": [") - set(_first ON) + set(_cuda_runtime_include_dir_entries "") foreach(_include_dir IN LISTS _include_dirs) if("${_include_dir}" STREQUAL "") @@ -32,21 +31,22 @@ function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}") - if(_first) - string(APPEND _json "\n") - set(_first OFF) - else() - string(APPEND _json ",\n") - endif() - string(APPEND _json " \"${_include_dir_json}\"") + list(APPEND _cuda_runtime_include_dir_entries " \"${_include_dir_json}\"") endforeach() - if(NOT _first) - string(APPEND _json "\n ]\n}\n") - else() - string(APPEND _json "]\n}\n") - endif() - + set(_json_entry_separator [=[ +, +]=]) + list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs) + + set(_json [=[ +{ + "cudaRuntimeIncludeDirs": [ 
+@_cuda_runtime_include_dirs@ + ] +} +]=]) + string(CONFIGURE "${_json}" _json @ONLY) set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) endfunction() diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index dd87d93e43..6833ad8189 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -9,6 +9,11 @@ #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" +#include +#include +#include +#include + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 13000 @@ -153,27 +158,64 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +namespace detail +{ + +template +struct is_smart_refctd_ptr : std::false_type {}; + +template +struct is_smart_refctd_ptr> : std::true_type {}; + +template +inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; -inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) +template +inline constexpr bool is_indirect_object_v = std::is_pointer_v> || is_smart_refctd_ptr_v; + +template +decltype(auto) as_ref(Object&& object) { - return getCUDAFunctionTable(*handler); + using object_t = std::remove_cvref_t; + if constexpr (std::is_pointer_v) + return *object; + else if constexpr (is_smart_refctd_ptr_v) + return *object; + else + return std::forward(object); } -inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr& handler) -{ - return getCUDAFunctionTable(*handler); +template +concept object_like = is_indirect_object_v && requires(Object&& object) { + { as_ref(std::forward(object)) } -> std::convertible_to; +}; + +template +concept const_object_like = is_indirect_object_v && requires(Object&& object) { + { as_ref(std::forward(object)) } -> std::convertible_to; +}; + +template +concept 
program_text_source = std::same_as, std::string> || + std::convertible_to; + } -inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler) +NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + +template +requires detail::const_object_like +inline const CUDA& getCUDAFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(*handler); + return getCUDAFunctionTable(detail::as_ref(std::forward(handler))); } -inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr& handler) +template +requires detail::const_object_like +inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(*handler); + return getNVRTCFunctionTable(detail::as_ref(std::forward(handler))); } NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -185,14 +227,11 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); -inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) -{ - return getAvailableDevices(*handler); -} - -inline const core::vector& getAvailableDevices(const core::smart_refctd_ptr& handler) +template +requires detail::const_object_like +inline const core::vector& getAvailableDevices(Handler&& handler) { - return getAvailableDevices(*handler); + return getAvailableDevices(detail::as_ref(std::forward(handler))); } NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); @@ -201,29 +240,26 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult 
createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + +template +requires detail::object_like && detail::program_text_source +inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* 
const* includeNames=nullptr) { - return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); + auto& handlerRef = detail::as_ref(std::forward(handler)); + if constexpr (std::same_as, std::string>) + return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); + else + { + const char* sourceText = source; + return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames); + } } -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + +template +requires detail::object_like && std::convertible_to +inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); + return createProgram(detail::as_ref(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -253,53 +289,34 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ); + +template +requires detail::object_like && detail::program_text_source inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return 
compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, system::IFile* file, core::SRange nvrtcOptions, + Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); + auto& handlerRef = detail::as_ref(std::forward(handler)); + if constexpr (std::same_as, std::string>) + return 
compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + else + { + const char* sourceText = source; + return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + } } + +template +requires detail::object_like && std::convertible_to inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, system::IFile* file, core::SRange nvrtcOptions, + Handler&& handler, File file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); + return compileDirectlyToPTX(detail::as_ref(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); } NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); @@ -311,84 +328,50 @@ NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); -inline CUdevice getInternalObject(const CCUDADevice* device) -{ - return getInternalObject(*device); -} - -inline CUdevice getInternalObject(const core::smart_refctd_ptr& device) -{ - return getInternalObject(*device); -} - -inline CUcontext getContext(const CCUDADevice* device) -{ - return getContext(*device); -} - -inline CUcontext getContext(const core::smart_refctd_ptr& device) -{ - return getContext(*device); -} - -inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size) -{ - return roundToGranularity(*device,location,size); -} - -inline size_t roundToGranularity(const core::smart_refctd_ptr& device, CUmemLocationType location, 
size_t size) -{ - return roundToGranularity(*device,location,size); -} - -inline core::smart_refctd_ptr createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params) -{ - return createExportableMemory(*device,std::move(params)); -} - -inline core::smart_refctd_ptr createExportableMemory(const core::smart_refctd_ptr& device, SExportableMemoryCreationParams&& params) -{ - return createExportableMemory(*device,std::move(params)); -} - -inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) -{ - return getDeviceptr(*memory); -} - -inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr& memory) -{ - return getDeviceptr(*memory); -} - -inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory) +template +requires ( + detail::const_object_like || + detail::const_object_like || + detail::const_object_like +) +inline auto getInternalObject(Object&& object) { - return getInternalObject(*memory); + return getInternalObject(detail::as_ref(std::forward(object))); } -inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr& memory) +template +requires detail::const_object_like +inline CUcontext getContext(Device&& device) { - return getInternalObject(*memory); + return getContext(detail::as_ref(std::forward(device))); } -inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer) +template +requires detail::const_object_like +inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size) { - return getMappedBuffer(*memory,mappedBuffer); + return roundToGranularity(detail::as_ref(std::forward(device)),location,size); } -inline CUresult getMappedBuffer(const core::smart_refctd_ptr& memory, CUdeviceptr* mappedBuffer) +template +requires detail::object_like +inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) { - return getMappedBuffer(*memory,mappedBuffer); + return 
createExportableMemory(detail::as_ref(std::forward(device)),std::move(params)); } -inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore) +template +requires detail::const_object_like +inline CUdeviceptr getDeviceptr(Memory&& memory) { - return getInternalObject(*semaphore); + return getDeviceptr(detail::as_ref(std::forward(memory))); } -inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr& semaphore) +template +requires detail::const_object_like +inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) { - return getInternalObject(*semaphore); + return getMappedBuffer(detail::as_ref(std::forward(memory)),mappedBuffer); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index fce7fd2b5a..13046d6d1e 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -5,11 +5,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" #include "nbl/system/ModuleLookupUtils.h" +#include "nlohmann/json.hpp" + #include #include #include -#include -#include #include namespace nbl::video::cuda_interop @@ -17,21 +17,11 @@ namespace nbl::video::cuda_interop namespace { -std::string readEnvironmentVariable(std::string_view name) +std::string readEnvironmentVariable(const char* name) { - #if defined(_NBL_PLATFORM_WINDOWS_) - char* value = nullptr; - size_t size = 0; - if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value) - return {}; - std::string result(value); - std::free(value); - return result; - #else - if (const char* value = std::getenv(std::string(name).c_str())) + if (const char* value = std::getenv(name)) return value; return {}; - #endif } bool isDirectory(const system::path& path) @@ -90,50 +80,6 @@ void appendCUDAIncludeRoot(core::vector& includeDirs, const system appendIncludeDir(includeDirs,root/"include"); } -core::vector parseStringArray(std::string_view text, std::string_view key) -{ - core::vector 
values; - const std::string quotedKey = "\"" + std::string(key) + "\""; - const auto keyPos = text.find(quotedKey); - if (keyPos==std::string_view::npos) - return values; - - const auto arrayBegin = text.find('[',keyPos+quotedKey.size()); - if (arrayBegin==std::string_view::npos) - return values; - const auto arrayEnd = text.find(']',arrayBegin+1); - if (arrayEnd==std::string_view::npos) - return values; - - for (auto pos = arrayBegin+1; pos=arrayEnd) - break; - - std::string value; - auto cursor = quoteBegin+1; - for (; cursor& includeDirs, const system::path& configFile) { if (!isRegularFile(configFile)) @@ -143,13 +89,20 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys if (!input) return; - std::stringstream buffer; - buffer << input.rdbuf(); - for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs")) - appendIncludeDir(includeDirs,system::path(path)); + const auto json = nlohmann::json::parse(input,nullptr,false); + if (json.is_discarded()) + return; + + const auto paths = json.find("cudaRuntimeIncludeDirs"); + if (paths==json.end() || !paths->is_array()) + return; + + for (const auto& path : *paths) + if (path.is_string()) + appendIncludeDir(includeDirs,system::path(path.get())); } -void appendRuntimePathsConfigEnv(core::vector& includeDirs, std::string_view name) +void appendRuntimePathsConfigEnv(core::vector& includeDirs, const char* name) { const auto value = readEnvironmentVariable(name); if (value.empty()) @@ -218,7 +171,7 @@ void appendPythonPackageIncludeDirs(core::vector& includeDirs, con appendIncludeDir(includeDirs,root/"include"); } -void appendPathListEnv(core::vector& includeDirs, std::string_view name) +void appendPathListEnv(core::vector& includeDirs, const char* name) { const auto value = readEnvironmentVariable(name); if (value.empty()) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 837f3ab28e..c75300016e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ 
b/src/nbl/ext/CUDAInterop/README.md @@ -66,6 +66,33 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { - `CUDA_PATH` is a developer fallback. It is not required for packaged applications. - Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. +## Runtime Header Distribution + +Nabla packages do not ship CUDA runtime headers. That is a packaging choice, not a hard legal requirement for applications that need NVRTC runtime compilation. + +NVIDIA CUDA EULA limits CUDA redistribution to selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See: + +- https://docs.nvidia.com/cuda/eula/#distribution +- https://docs.nvidia.com/cuda/eula/#attachment-a + +Relevant Attachment A header entries include: + +- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`. +- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`. +- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`. +- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`. + +CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages: + +- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 +- https://github.com/cupy/cupy/issues/8466 + +For Nabla consumers this means: + +- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`. +- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it. +- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default. + ## Properties - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. 
From 8a119dda501a7f9c6f979ee7e6d98e6840c04d35 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 13:24:10 +0200 Subject: [PATCH 18/51] Hide CUDA interop native state construction --- include/nbl/ext/CUDAInterop/CCUDADevice.h | 7 ++-- .../ext/CUDAInterop/CCUDAExportableMemory.h | 6 ++- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 6 +-- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 7 ++-- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 6 ++- src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 22 ++++++++--- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 24 +++++++++++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 16 ++++++-- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 8 +++- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 8 +++- .../CUDAInterop/CUDAInteropNativeState.hpp | 9 +++++ src/nbl/ext/CUDAInterop/README.md | 38 +++++++++++++++++-- 12 files changed, 126 insertions(+), 31 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 12465f40f4..94eb450802 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -25,7 +25,6 @@ struct SAccess; class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: - struct SNativeState; #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; #else @@ -68,8 +67,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - ~CCUDADevice() override; inline core::SRange geDefaultCompileOptions() const @@ -86,8 +83,12 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted core::smart_refctd_ptr 
importExternalSemaphore(core::smart_refctd_ptr&& sem); private: + friend class CCUDAHandler; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + static constexpr auto CudaMemoryLocationCount = 5; const system::logger_opt_ptr m_logger; diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 80a9b3630a..6d29739408 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -21,7 +21,6 @@ struct SAccess; class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: - struct SNativeState; struct SCachedCreationParams { size_t size; @@ -31,7 +30,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted bool deviceLocal; }; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); ~CCUDAExportableMemory() override; core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; @@ -39,6 +37,10 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted private: friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + static core::smart_refctd_ptr create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; SCachedCreationParams m_params; std::unique_ptr m_native; diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index bed4f9a31c..f6b5d578a8 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ 
b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -44,11 +44,8 @@ NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompile class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: - struct SNativeState; static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); @@ -75,6 +72,9 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted private: friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index adb803f12c..87f804ce76 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -19,14 +19,15 @@ struct SAccess; class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: - struct SNativeState; - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); - ~CCUDAImportedMemory() override; private: + friend class CCUDADevice; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; std::unique_ptr m_native; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 894f2444c0..c8bf77313e 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ 
b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -22,13 +22,15 @@ struct SAccess; class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: - struct SNativeState; - CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); ~CCUDAImportedSemaphore() override; private: + friend class CCUDADevice; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; std::unique_ptr m_native; diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index ebac00b7b4..8e696d0827 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -27,6 +27,8 @@ CCUDADevice::CCUDADevice( m_handler(std::move(handler)), m_native(std::move(nativeState)) { + assert(m_native); + m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); @@ -150,7 +152,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice #endif }; - auto nativeState = std::make_unique(); + auto nativeState = SAccess::makeExportableMemoryNativeState(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -166,7 +168,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice return nullptr; } - if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) { logger.log("Fail to reserve address and map memory!", 
system::ILogger::ELL_ERROR); @@ -185,7 +187,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(&device), std::move(params), std::move(nativeState)); + return SAccess::makeExportableMemory(core::smart_refctd_ptr(&device),std::move(params),std::move(nativeState)); } } @@ -215,7 +217,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), std::make_unique(cuExtMem)); + return core::smart_refctd_ptr( + new CCUDAImportedMemory(core::smart_refctd_ptr(this),std::move(mem),std::make_unique(cuExtMem)), + core::dont_grab + ); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) @@ -245,7 +250,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), std::make_unique(cusema)); + return core::smart_refctd_ptr( + new CCUDAImportedSemaphore(core::smart_refctd_ptr(this),std::move(sema),std::make_unique(cusema)), + core::dont_grab + ); } CCUDADevice::~CCUDADevice() @@ -275,7 +283,9 @@ CCUDADevice::CCUDADevice( , m_virtualArchitecture(virtualArchitecture) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} CCUDADevice::~CCUDADevice() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index a65d1b680c..7d5483af04 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -14,7 +14,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr : m_device(std::move(device)) , m_params(std::move(params)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} 
+ +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { @@ -76,7 +86,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr : m_device(std::move(device)) , m_params(std::move(params)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} CCUDAExportableMemory::~CCUDAExportableMemory() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 13046d6d1e..229a27cfac 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -309,6 +309,8 @@ CCUDAHandler::CCUDAHandler( , m_logger(std::move(_logger)) , m_version(_version) { + assert(m_native); + for (auto& header : m_headers) { m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); @@ -858,7 +860,10 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - return core::make_smart_refctd_ptr(std::make_unique(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion); + return core::smart_refctd_ptr( + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion), + core::dont_grab + ); } namespace cuda_native @@ -1090,7 +1095,10 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if 
(arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, std::make_unique(device.handle), core::smart_refctd_ptr(this)); + return core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + core::dont_grab + ); } } return nullptr; @@ -1115,7 +1123,9 @@ CCUDAHandler::CCUDAHandler( , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) , m_version(_version) -{} +{ + assert(m_native); +} CCUDAHandler::~CCUDAHandler() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index 8de3ce3e63..3a8ed56371 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -14,7 +14,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev : m_device(std::move(device)) , m_src(std::move(src)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} namespace cuda_native { @@ -57,7 +59,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev : m_device(std::move(device)) , m_src(std::move(src)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} CCUDAImportedMemory::~CCUDAImportedMemory() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index fdbb56b0cf..6d980ed126 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -13,7 +13,9 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr makeExportableMemoryNativeState() + { + return std::unique_ptr(new CCUDAExportableMemory::SNativeState()); + } + static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } + static core::smart_refctd_ptr 
makeExportableMemory(core::smart_refctd_ptr device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr&& nativeState) + { + return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState)); + } static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index c75300016e..214d5add14 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -11,18 +11,26 @@ - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. -## Usage +## Basic Usage ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` +This path does not require CUDA SDK headers on the consuming project. + +## Native Opt-In + +Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types. + ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE) ``` +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`. 
+ ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE @@ -36,19 +44,42 @@ nbl_target_link_cuda_interop(native_app PRIVATE ) ``` +Pseudo flow: + ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" -auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); -auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv); +auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); +auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); + +std::string log; +auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( + handler, + cudaSource, + "kernel.cu", + cudaDevice->geDefaultCompileOptions(), + 0, + nullptr, + nullptr, + &log +); ``` +`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly. + +Reference smoke: + +- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt` +- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` +- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` +- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` + ## Runtime Header Discovery - `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. @@ -100,6 +131,7 @@ For Nabla consumers this means: - Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. 
- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. +- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. - Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. From e018545fb659ee74400a2635f93f502cd1d0f4f3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 13:49:22 +0200 Subject: [PATCH 19/51] Clean up CUDA runtime header discovery --- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 112 ++++++++++++----------- src/nbl/ext/CUDAInterop/README.md | 5 +- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 229a27cfac..de7f14b58f 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -17,6 +17,12 @@ namespace nbl::video::cuda_interop namespace { +#if defined(_NBL_PLATFORM_WINDOWS_) +inline constexpr char EnvironmentPathListSeparator = ';'; +#else +inline constexpr char EnvironmentPathListSeparator = ':'; +#endif + std::string readEnvironmentVariable(const char* name) { if (const char* value = std::getenv(name)) @@ -71,6 +77,39 @@ void appendIncludeDir(core::vector& includeDirs, system::path path includeDirs.push_back(std::move(path)); } +void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const system::path& root, uint32_t maxDepth) +{ + if (!isDirectory(root)) + return; + + if (looksLikeCUDAIncludeDir(root)) + { + 
appendIncludeDir(includeDirs,root); + return; + } + if (maxDepth==0u) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + + std::error_code entryError; + if (!entry.is_directory(entryError)) + continue; + candidates.push_back(entry.path()); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u); +} + void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) { if (root.empty()) @@ -102,24 +141,20 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys appendIncludeDir(includeDirs,system::path(path.get())); } -void appendRuntimePathsConfigEnv(core::vector& includeDirs, const char* name) +template +void appendPathListEnv(const char* name, Append append) { const auto value = readEnvironmentVariable(name); if (value.empty()) return; - #if defined(_NBL_PLATFORM_WINDOWS_) - constexpr char Separator = ';'; - #else - constexpr char Separator = ':'; - #endif - size_t begin = 0; while (begin& includeDirs, const co for (const auto& runtimePathFile : explicitRuntimePathFiles) appendRuntimePathsConfig(includeDirs,runtimePathFile); - appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON"); - appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig); const auto exeDir = system::executableDirectory(); if (!exeDir.empty()) appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); - - #if defined(_NBL_PLATFORM_WINDOWS_) - const auto 
releaseModuleDir = system::loadedModuleDirectory("Nabla.dll"); - if (!releaseModuleDir.empty()) - appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName); - const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll"); - if (!debugModuleDir.empty()) - appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName); - #endif } void appendAppLocalIncludeDirs(core::vector& includeDirs) @@ -155,9 +182,10 @@ void appendAppLocalIncludeDirs(core::vector& includeDirs) return; appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); - appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u); appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u); } void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) @@ -165,40 +193,17 @@ void appendPythonPackageIncludeDirs(core::vector& includeDirs, con if (root.empty()) return; - appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include"); - appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u); + appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u); appendIncludeDir(includeDirs,root/"Library"/"include"); appendIncludeDir(includeDirs,root/"include"); } -void appendPathListEnv(core::vector& includeDirs, const char* name) -{ - const auto value = readEnvironmentVariable(name); - if (value.empty()) - return; - - #if defined(_NBL_PLATFORM_WINDOWS_) - constexpr char Separator = ';'; - #else - constexpr char Separator = ':'; - #endif - - size_t begin = 0; - while (begin& includeDirs) { - appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); - 
appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); @@ -942,13 +947,18 @@ ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } +static const core::vector& getDefaultRuntimeIncludeOptions() +{ + static const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment()); + return RuntimeIncludeOptions; +} + static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) { if (result!=NVRTC_SUCCESS) return {nullptr,result}; - const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); - const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions(); core::vector options; options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); for (const auto option : nvrtcOptions) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 214d5add14..0d7b01a033 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -90,8 +90,9 @@ Reference smoke: - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. 
- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. -- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list. +- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use. - Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. - If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. - `CUDA_PATH` is a developer fallback. It is not required for packaged applications. 
From c6ef6eea004ceeb2b25378f1312deb79cd21f283 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 14:30:58 +0200 Subject: [PATCH 20/51] Move CUDA interop API back into video --- include/nbl/ext/CUDAInterop/CUDAInterop.h | 13 -- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 2 +- include/nbl/ext/OptiX/IDenoiser.h | 2 +- .../{ext/CUDAInterop => video}/CCUDADevice.h | 6 +- .../CCUDAExportableMemory.h | 0 .../{ext/CUDAInterop => video}/CCUDAHandler.h | 0 .../CCUDAImportedMemory.h | 4 +- .../CCUDAImportedSemaphore.h | 0 include/nbl/video/CUDAInterop.h | 13 ++ src/nbl/CMakeLists.txt | 10 +- src/nbl/ext/CUDAInterop/README.md | 138 ++++++++---------- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 2 +- .../ext/CUDAInterop/smoke/public_boundary.cpp | 2 +- .../CUDAInterop => video}/CCUDADevice.cpp | 2 +- .../CCUDAExportableMemory.cpp | 2 +- .../CUDAInterop => video}/CCUDAHandler.cpp | 2 +- .../CCUDAImportedMemory.cpp | 2 +- .../CCUDAImportedSemaphore.cpp | 2 +- .../CUDAInteropNativeState.hpp | 4 +- 19 files changed, 91 insertions(+), 115 deletions(-) delete mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h rename include/nbl/{ext/CUDAInterop => video}/CCUDADevice.h (94%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.h (100%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAHandler.h (100%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.h (86%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.h (100%) create mode 100644 include/nbl/video/CUDAInterop.h rename src/nbl/{ext/CUDAInterop => video}/CCUDADevice.cpp (99%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.cpp (98%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAHandler.cpp (99%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.cpp (97%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.cpp (97%) rename src/nbl/{ext/CUDAInterop => video}/CUDAInteropNativeState.hpp (96%) diff --git 
a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h deleted file mode 100644 index 06d9016dc8..0000000000 --- a/include/nbl/ext/CUDAInterop/CUDAInterop.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ -#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ - -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" - -#endif diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 6833ad8189..9d23fcb4ef 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -4,7 +4,7 @@ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 496383d92d..bb0677657d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "nbl/video/CCUDAHandler.h" #include #include diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/video/CCUDADevice.h similarity index 94% rename from include/nbl/ext/CUDAInterop/CCUDADevice.h rename to include/nbl/video/CCUDADevice.h index 94eb450802..bc1931e363 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h 
+++ b/include/nbl/video/CCUDADevice.h @@ -5,9 +5,9 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ #include "nbl/video/declarations.h" -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" #include #include diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h rename to include/nbl/video/CCUDAExportableMemory.h diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAHandler.h rename to include/nbl/video/CCUDAHandler.h diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h similarity index 86% rename from include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h rename to include/nbl/video/CCUDAImportedMemory.h index 87f804ce76..ac41c110a2 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ -#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ #include "nbl/video/declarations.h" diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h rename to include/nbl/video/CCUDAImportedSemaphore.h diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h new file mode 100644 index 0000000000..57e92ae647 --- /dev/null +++ b/include/nbl/video/CUDAInterop.h @@ -0,0 +1,13 @@ +// 
Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" + +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ccb600ca32..d56c223e34 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -126,11 +126,11 @@ set(NBL_CORE_SOURCES ) set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CCUDADevice.cpp - ext/CUDAInterop/CCUDAExportableMemory.cpp - ext/CUDAInterop/CCUDAHandler.cpp - ext/CUDAInterop/CCUDAImportedMemory.cpp - ext/CUDAInterop/CCUDAImportedSemaphore.cpp + video/CCUDADevice.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAHandler.cpp + video/CCUDAImportedMemory.cpp + video/CCUDAImportedSemaphore.cpp ) set(NBL_SYSTEM_SOURCES diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 0d7b01a033..e99edd82c0 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,50 +1,50 @@ -# CUDA Interop Targets +# CUDA Interop -- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. -- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. -- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds. -- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. -- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact. -- The target only carries usage requirements and IDE-visible sources. -- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. 
-- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`. -- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. -- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. +## Layout -## Basic Usage +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. +- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types. + +## CMake Usage + +Default Nabla usage stays SDK-free: ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` -This path does not require CUDA SDK headers on the consuming project. - -## Native Opt-In - -Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types. +Native CUDA interop is explicit: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE) ``` -`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`. +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation. 
+ +Optional overrides: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE INCLUDE_DIRS "${cuda_runtime_headers}" ) -``` -```cmake nbl_target_link_cuda_interop(native_app PRIVATE RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$/my_cuda_runtime.json" ) ``` -Pseudo flow: +Consumers can also choose the SDK used for native compilation with: + +```cmake +cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= +``` + +## Native Usage ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" @@ -71,85 +71,61 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly. +Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly. + +Smoke examples: + +- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. +- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. -Reference smoke: +## ABI -- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt` -- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` -- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` -- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` +- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. 
+- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. +- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`. +- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. ## Runtime Header Discovery -- `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. -- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`. -- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation. -- `RUNTIME_JSON ` overrides the generated JSON location. Plain paths and `$` are supported. -- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime. -- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. -- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. -- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. 
-- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. -- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use. -- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. -- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. -- `CUDA_PATH` is a developer fallback. It is not required for packaged applications. -- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. -## Runtime Header Distribution +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. +- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. 
+- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. -Nabla packages do not ship CUDA runtime headers. That is a packaging choice, not a hard legal requirement for applications that need NVRTC runtime compilation. +Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -NVIDIA CUDA EULA limits CUDA redistribution to selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See: +Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." 
See: - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a -Relevant Attachment A header entries include: +Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`. -- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`. -- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`. -- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`. -- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`. - -CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages: +CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: - https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 - https://github.com/cupy/cupy/issues/8466 -For Nabla consumers this means: - -- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`. -- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it. -- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default. - -## Properties - -- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. -- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`. -- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. -- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. -- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. -- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state. -- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. -- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. -- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. 
-- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla. -- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers. -- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. -- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +## CUDA ON/OFF Builds + +- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. ## Related Designs -This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API. +The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API. -- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). 
-- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). -- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). -- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48). -- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries). -- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072). -- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html). 
+- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 +- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index e36fe65701..31bf461804 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -1,4 +1,4 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/system/IApplicationFramework.h" #include diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index eb7061f0ee..dc1c247806 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -18,7 +18,7 @@ #error "Nabla consumers must not include CUDA SDK headers." #endif -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #error "Nabla consumers must not get the CUDA opt-in define." 
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp similarity index 99% rename from src/nbl/ext/CUDAInterop/CCUDADevice.cpp rename to src/nbl/video/CCUDADevice.cpp index 8e696d0827..fcafc8bc48 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp similarity index 98% rename from src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp rename to src/nbl/video/CCUDAExportableMemory.cpp index 7d5483af04..4eb37b720a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp similarity index 99% rename from src/nbl/ext/CUDAInterop/CCUDAHandler.cpp rename to src/nbl/video/CCUDAHandler.cpp index de7f14b58f..ced76b9713 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/system/ModuleLookupUtils.h" #include "nlohmann/json.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp similarity index 97% rename from src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp rename to src/nbl/video/CCUDAImportedMemory.cpp index 3a8ed56371..9e58fbac10 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp similarity index 97% rename from src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp rename to src/nbl/video/CCUDAImportedSemaphore.cpp index 6d980ed126..bc1db625d1 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp similarity index 96% rename from src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp rename to src/nbl/video/CUDAInteropNativeState.hpp index 74cb7823d5..79139d015d 100644 --- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -1,5 +1,5 @@ -#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ -#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" From d559a2caeafa9aef0c308b7716c77d4be076fc28 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 15:02:03 +0200 Subject: [PATCH 21/51] Move smart pointer helpers into core --- include/nbl/core/decl/smart_refctd_ptr.h | 38 ++++++++ .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 90 ++++++------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 7c231fea4b..78609fa34c 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -7,6 +7,10 @@ #include "nbl/core/IReferenceCounted.h" +#include +#include +#include + namespace nbl::core { @@ -118,6 +122,40 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); +template +struct is_smart_refctd_ptr : std::false_type {}; + +template +struct is_smart_refctd_ptr> : std::true_type {}; + +template +inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; + +template 
+inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v> || is_smart_refctd_ptr_v; + +template +decltype(auto) dereference(Object&& object) +{ + using object_t = std::remove_cvref_t; + if constexpr (std::is_pointer_v) + return *object; + else if constexpr (is_smart_refctd_ptr_v) + return *object; + else + return std::forward(object); +} + +template +concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { + { dereference(std::forward(object)) } -> std::convertible_to; +}; + +template +concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { + { dereference(std::forward(object)) } -> std::convertible_to; +}; + template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 9d23fcb4ef..fe5fb5875e 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -161,40 +161,6 @@ struct SExportableMemoryCreationParams namespace detail { -template -struct is_smart_refctd_ptr : std::false_type {}; - -template -struct is_smart_refctd_ptr> : std::true_type {}; - -template -inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; - -template -inline constexpr bool is_indirect_object_v = std::is_pointer_v> || is_smart_refctd_ptr_v; - -template -decltype(auto) as_ref(Object&& object) -{ - using object_t = std::remove_cvref_t; - if constexpr (std::is_pointer_v) - return *object; - else if constexpr (is_smart_refctd_ptr_v) - return *object; - else - return std::forward(object); -} - -template -concept object_like = is_indirect_object_v && requires(Object&& object) { - { as_ref(std::forward(object)) } -> std::convertible_to; -}; - -template -concept const_object_like = is_indirect_object_v && requires(Object&& object) { - { as_ref(std::forward(object)) } -> 
std::convertible_to; -}; - template concept program_text_source = std::same_as, std::string> || std::convertible_to; @@ -205,17 +171,17 @@ NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const CUDA& getCUDAFunctionTable(Handler&& handler) { - return getCUDAFunctionTable(detail::as_ref(std::forward(handler))); + return getCUDAFunctionTable(core::dereference(std::forward(handler))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(detail::as_ref(std::forward(handler))); + return getNVRTCFunctionTable(core::dereference(std::forward(handler))); } NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -228,10 +194,10 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const core::vector& getAvailableDevices(Handler&& handler) { - return getAvailableDevices(detail::as_ref(std::forward(handler))); + return getAvailableDevices(core::dereference(std::forward(handler))); } NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); @@ -242,10 +208,10 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); template -requires 
detail::object_like && detail::program_text_source +requires core::dereferenceable_to && detail::program_text_source inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - auto& handlerRef = detail::as_ref(std::forward(handler)); + auto& handlerRef = core::dereference(std::forward(handler)); if constexpr (std::same_as, std::string>) return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); else @@ -256,10 +222,10 @@ inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& } template -requires detail::object_like && std::convertible_to +requires core::dereferenceable_to && std::convertible_to inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - return createProgram(detail::as_ref(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); + return createProgram(core::dereference(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -291,14 +257,14 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( ); template -requires detail::object_like && detail::program_text_source +requires core::dereferenceable_to && detail::program_text_source inline ptx_and_nvrtcResult_t compileDirectlyToPTX( Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - 
auto& handlerRef = detail::as_ref(std::forward(handler)); + auto& handlerRef = core::dereference(std::forward(handler)); if constexpr (std::same_as, std::string>) return compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); else @@ -309,14 +275,14 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( } template -requires detail::object_like && std::convertible_to +requires core::dereferenceable_to && std::convertible_to inline ptx_and_nvrtcResult_t compileDirectlyToPTX( Handler&& handler, File file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(detail::as_ref(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); + return compileDirectlyToPTX(core::dereference(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); } NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); @@ -330,48 +296,48 @@ NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& sem template requires ( - detail::const_object_like || - detail::const_object_like || - detail::const_object_like + core::const_dereferenceable_to || + core::const_dereferenceable_to || + core::const_dereferenceable_to ) inline auto getInternalObject(Object&& object) { - return getInternalObject(detail::as_ref(std::forward(object))); + return getInternalObject(core::dereference(std::forward(object))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUcontext getContext(Device&& device) { - return getContext(detail::as_ref(std::forward(device))); + return getContext(core::dereference(std::forward(device))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline size_t roundToGranularity(Device&& 
device, CUmemLocationType location, size_t size) { - return roundToGranularity(detail::as_ref(std::forward(device)),location,size); + return roundToGranularity(core::dereference(std::forward(device)),location,size); } template -requires detail::object_like +requires core::dereferenceable_to inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) { - return createExportableMemory(detail::as_ref(std::forward(device)),std::move(params)); + return createExportableMemory(core::dereference(std::forward(device)),std::move(params)); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUdeviceptr getDeviceptr(Memory&& memory) { - return getDeviceptr(detail::as_ref(std::forward(memory))); + return getDeviceptr(core::dereference(std::forward(memory))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) { - return getMappedBuffer(detail::as_ref(std::forward(memory)),mappedBuffer); + return getMappedBuffer(core::dereference(std::forward(memory)),mappedBuffer); } } From 38705b93794e820417a2b3f223d258e07aeebb8f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:06:26 +0200 Subject: [PATCH 22/51] Use CUDA interop accessors --- examples_tests | 2 +- include/nbl/core/decl/smart_refctd_ptr.h | 39 ---- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 219 ++++-------------- src/nbl/ext/CUDAInterop/README.md | 22 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 18 +- src/nbl/video/CCUDADevice.cpp | 51 ++-- src/nbl/video/CCUDAExportableMemory.cpp | 10 +- src/nbl/video/CCUDAHandler.cpp | 61 ++--- src/nbl/video/CCUDAImportedMemory.cpp | 11 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 7 +- src/nbl/video/CUDAInteropNativeState.hpp | 17 +- 11 files changed, 135 insertions(+), 322 deletions(-) diff --git a/examples_tests b/examples_tests index 
b2c639c8b7..1dc7f6a075 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256 +Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4 diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 78609fa34c..814c807a84 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -7,10 +7,6 @@ #include "nbl/core/IReferenceCounted.h" -#include -#include -#include - namespace nbl::core { @@ -122,41 +118,6 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); -template -struct is_smart_refctd_ptr : std::false_type {}; - -template -struct is_smart_refctd_ptr> : std::true_type {}; - -template -inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; - -template -inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v> || is_smart_refctd_ptr_v; - -template -decltype(auto) dereference(Object&& object) -{ - using object_t = std::remove_cvref_t; - if constexpr (std::is_pointer_v) - return *object; - else if constexpr (is_smart_refctd_ptr_v) - return *object; - else - return std::forward(object); -} - -template -concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { - { dereference(std::forward(object)) } -> std::convertible_to; -}; - -template -concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { - { dereference(std::forward(object)) } -> std::convertible_to; -}; - - template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... 
args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index fe5fb5875e..57669f591a 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -9,10 +9,7 @@ #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" -#include #include -#include -#include #include "cuda.h" #include "nvrtc.h" @@ -158,196 +155,62 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -namespace detail -{ - -template -concept program_text_source = std::same_as, std::string> || - std::convertible_to; - -} - -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); - -template -requires core::const_dereferenceable_to -inline const CUDA& getCUDAFunctionTable(Handler&& handler) -{ - return getCUDAFunctionTable(core::dereference(std::forward(handler))); -} - -template -requires core::const_dereferenceable_to -inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) -{ - return getNVRTCFunctionTable(core::dereference(std::forward(handler))); -} - -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - -template -T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } - -NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); - -template -requires core::const_dereferenceable_to -inline const core::vector& getAvailableDevices(Handler&& handler) -{ - return getAvailableDevices(core::dereference(std::forward(handler))); -} - -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, 
const char* const* includeNames=nullptr); -inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); -} -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - -template -requires core::dereferenceable_to && detail::program_text_source -inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - auto& handlerRef = core::dereference(std::forward(handler)); - if constexpr (std::same_as, std::string>) - return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); - else - { - const char* sourceText = source; - return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames); - } -} - -template -requires core::dereferenceable_to && std::convertible_to -inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(core::dereference(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); -} -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - struct ptx_and_nvrtcResult_t { core::smart_refctd_ptr ptx; nvrtcResult result; }; -NBL_API2 ptx_and_nvrtcResult_t getPTX(const 
CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -); -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -); - -template -requires core::dereferenceable_to && detail::program_text_source -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - auto& handlerRef = core::dereference(std::forward(handler)); - if constexpr (std::same_as, std::string>) - return compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - else - { - const char* sourceText = source; - return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } -} - -template -requires core::dereferenceable_to && std::convertible_to -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - Handler&& handler, File file, core::SRange nvrtcOptions, - const int 
headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(core::dereference(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); -} - -NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); -NBL_API2 CUcontext getContext(const CCUDADevice& device); -NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); -NBL_API2 core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); -NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); -NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); - -template -requires ( - core::const_dereferenceable_to || - core::const_dereferenceable_to || - core::const_dereferenceable_to -) -inline auto getInternalObject(Object&& object) -{ - return getInternalObject(core::dereference(std::forward(object))); -} - -template -requires core::const_dereferenceable_to -inline CUcontext getContext(Device&& device) -{ - return getContext(core::dereference(std::forward(device))); -} +// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla. +// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types. 
+class NBL_API2 CCUDAHandlerAccessor +{ + public: + static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); + static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); + static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + static const core::vector& getAvailableDevices(const CCUDAHandler& handler); + static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); + static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); + static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); + static ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ); +}; -template -requires core::const_dereferenceable_to -inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size) +class NBL_API2 CCUDADeviceAccessor { - return roundToGranularity(core::dereference(std::forward(device)),location,size); -} + public: + static CUdevice getInternalObject(const CCUDADevice& device); + static CUcontext getContext(const CCUDADevice& device); + static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); + static core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); +}; -template -requires 
core::dereferenceable_to -inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) +class NBL_API2 CCUDAExportableMemoryAccessor { - return createExportableMemory(core::dereference(std::forward(device)),std::move(params)); -} + public: + static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +}; -template -requires core::const_dereferenceable_to -inline CUdeviceptr getDeviceptr(Memory&& memory) +class NBL_API2 CCUDAImportedMemoryAccessor { - return getDeviceptr(core::dereference(std::forward(memory))); -} + public: + static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); + static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +}; -template -requires core::const_dereferenceable_to -inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) +class NBL_API2 CCUDAImportedSemaphoreAccessor { - return getMappedBuffer(core::dereference(std::forward(memory)),mappedBuffer); -} + public: + static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +}; } -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!nbl::video::cuda_native::defaultHandleResult(*(handler), cudaResult)) { \ - assert(false); \ - } \ - } while(0) - #endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index e99edd82c0..ea92dcec7d 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -5,7 +5,7 @@ - `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. - Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. 
It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types. +- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types. ## CMake Usage @@ -52,16 +52,17 @@ cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { +auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); std::string log; -auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( - handler, - cudaSource, +std::string cudaSource = loadKernelText(); +auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + *handler, + std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, @@ -71,7 +72,12 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly. +Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes: + +- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling. +- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. 
+- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. +- Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. Smoke examples: @@ -84,7 +90,7 @@ Smoke examples: - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`. +- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. @@ -98,7 +104,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. 
- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. +- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 3b799a56cf..0b07bfa137 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,7 +23,7 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { + auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, .location = CU_MEM_LOCATION_TYPE_DEVICE, @@ -37,16 +37,16 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory); + cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory); - const CUexternalSemaphore 
cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr; return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::getCUDAFunctionTable(handler); + auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -95,9 +95,9 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto [ptx, result] = cuda_native::compileDirectlyToPTX( + auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( handler, - Source, + std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, 0, @@ -121,7 +121,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v())), CUdevice>); + static_assert(std::is_same_v())), CUdevice>); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -144,7 +144,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); + const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler); if (devices.empty()) return true; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index fcafc8bc48..359cd093a1 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -34,10 +34,12 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + assert(false); for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) { @@ -50,30 +52,31 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + assert(false); } } namespace cuda_native { -CUdevice getInternalObject(const CCUDADevice& device) +CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device) { return SAccess::native(device).handle; } -CUcontext getContext(const CCUDADevice& device) +CUcontext 
CCUDADeviceAccessor::getContext(const CCUDADevice& device) { return SAccess::native(device).context; } -size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) { const auto& granularity = SAccess::native(device).allocationGranularity[location]; return ((size - 1) / granularity + 1) * granularity; @@ -90,7 +93,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::getCUDAFunctionTable(*handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -98,7 +101,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } @@ -109,8 +113,10 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } @@ -122,7 +128,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept namespace cuda_native { 
-core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +core::smart_refctd_ptr CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) { const auto handler = device.getHandler(); auto& native = SAccess::native(device); @@ -131,11 +137,11 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice CCUDAExportableMemory::SCachedCreationParams params = { .size = inParams.size, .alignment = inParams.alignment, - .granularSize = roundToGranularity(device, inParams.location, inParams.size), + .granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size), .deviceLocal = isDeviceLocal(inParams.location) }; - auto& cu = getCUDAFunctionTable(*handler); + auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -145,7 +151,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = getAllocationHandleType(), + .requestedHandleTypes = SAccess::allocationHandleType(), .location = { .type = inParams.location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, @@ -164,7 +170,8 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); + if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); return nullptr; } @@ -172,7 +179,8 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); + if 
(!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); bool closeSucceed = CloseExternalHandle(params.externalHandle); assert(closeSucceed); @@ -194,7 +202,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -225,7 +233,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -258,7 +266,8 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + assert(false); } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 4eb37b720a..f84169e38f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,11 +52,13 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - 
ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) + assert(false); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) + assert(false); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); assert(closeSucceed); @@ -66,7 +68,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() namespace cuda_native { -CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) +CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory) { return SAccess::native(memory).ptr; } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index ced76b9713..0064a191a6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -355,7 +355,7 @@ CCUDAHandler::~CCUDAHandler() = default; namespace cuda_native { -bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) { @@ -721,12 +721,12 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) return false; } -bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return defaultHandleResult(result,SAccess::logger(handler)); + return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler)); } -bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch 
(result) { @@ -874,22 +874,22 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) +const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).cuda; } -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) +const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).nvrtc; } -const core::vector& getAvailableDevices(const CCUDAHandler& handler) +const core::vector& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler) { return SAccess::native(handler).availableDevices; } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -901,24 +901,12 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames) -{ - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return 
createProgram(handler,prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); -} - -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) { return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); @@ -931,7 +919,7 @@ nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::s return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -968,16 +956,16 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); - result = compileProgram(handler,program,{optionsBegin,optionsEnd}); + result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); if (log) - getProgramLog(handler,program,*log); + CCUDAHandlerAccessor::getProgramLog(handler,program,*log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - return getPTX(handler,program); + return CCUDAHandlerAccessor::getPTX(handler,program); } -ptx_and_nvrtcResult_t compileDirectlyToPTX( +ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount, const char* const* headerContents, const char* const* includeNames, std::string* log) @@ -990,24 +978,7 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX( SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); - result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); -} - -ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, - const int headerCount, const char* const* headerContents, const char* const* includeNames, - std::string* log) -{ - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(handler,&program,file,headerCount,headerContents,includeNames); + result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 
9e58fbac10..9145fe18ac 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -21,18 +21,18 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev namespace cuda_native { -CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory) +CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory) { return SAccess::native(memory).handle; } -CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) +CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) { CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; bufferDesc.size = SAccess::source(memory)->getAllocationSize(); - const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); + const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); } @@ -41,8 +41,9 @@ CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedB CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) + assert(false); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index bc1db625d1..5d7d3e07ae 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -20,7 +20,7 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrgetHandler()); - 
ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) + assert(false); } } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 79139d015d..7e602bb0f3 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -57,15 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState namespace cuda_native { -inline CUmemAllocationHandleType getAllocationHandleType() -{ -#ifdef _WIN32 - return CU_MEM_HANDLE_TYPE_WIN32; -#else - return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; -#endif -} - struct SAccess { static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } @@ -96,6 +87,14 @@ struct SAccess static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } + static CUmemAllocationHandleType allocationHandleType() + { + #ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; + #else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + #endif + } }; } From 23e6ef5235ebf2b6f86694652f437b37b0479c53 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:36:31 +0200 Subject: [PATCH 23/51] Use explicit CUDA compile log --- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CUDAInteropNative.h | 9 ++++----- src/nbl/ext/CUDAInterop/README.md | 7 ++++--- src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp | 8 ++++---- src/nbl/video/CCUDAHandler.cpp | 15 +++++++-------- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/examples_tests 
b/examples_tests index 1dc7f6a075..3c57a88af9 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4 +Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 57669f591a..d409c774e1 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -155,7 +155,7 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -struct ptx_and_nvrtcResult_t +struct SPTXResult { core::smart_refctd_ptr ptx; nvrtcResult result; @@ -175,11 +175,10 @@ class NBL_API2 CCUDAHandlerAccessor static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); - static ptx_and_nvrtcResult_t compileDirectlyToPTX( + static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); + static SPTXResult compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); }; diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index ea92dcec7d..7d350da379 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -60,15 +60,15 
@@ auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemo std::string log; std::string cudaSource = loadKernelText(); -auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( +auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( *handler, std::move(cudaSource), "kernel.cu", cudaDevice->getDefaultCompileOptions(), + log, 0, nullptr, - nullptr, - &log ); ``` @@ -78,6 +78,7 @@ Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC ty - `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. - `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. - Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. +- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API.
Smoke examples: diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 0b07bfa137..ace1059215 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -95,17 +95,17 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( handler, std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, + log, 0, nullptr, - nullptr, - &log + nullptr ); - return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u; + return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 0064a191a6..9db99e7642 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -919,7 +919,7 @@ nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvr return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -941,8 +941,9 @@ static const core::vector& getDefaultRuntimeIncludeOptions() return RuntimeIncludeOptions; } -static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) +static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) { + log.clear(); if (result!=NVRTC_SUCCESS) return {nullptr,result}; @@ -957,24 +958,22 @@ static 
ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); - if (log) - CCUDAHandlerAccessor::getProgramLog(handler,program,*log); + CCUDAHandlerAccessor::getProgramLog(handler,program,log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; return CCUDAHandlerAccessor::getPTX(handler,program); } -ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX( +SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount, const char* const* headerContents, const char* const* includeNames, - std::string* log) + std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { nvrtcProgram program = nullptr; nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; auto cleanup = core::makeRAIIExiter([&]() -> void { - if (result!=NVRTC_SUCCESS && program) + if (program) SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); From a640183dbc6229f3b9b60c1d22bb1c50c7b8e5fe Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:05:26 +0200 Subject: [PATCH 24/51] Trim CUDA interop API surface --- cmake/common.cmake | 19 +++------------- include/nbl/core/decl/smart_refctd_ptr.h | 1 + .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 4 ++-- include/nbl/video/CCUDADevice.h | 2 -- include/nbl/video/CCUDAHandler.h | 12 ++++++---- src/nbl/ext/CUDAInterop/README.md | 11 +++++++--- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 5 ++++- src/nbl/video/CCUDAHandler.cpp | 22 ++++--------------- 8 files changed, 30 insertions(+), 46 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index ae2264fda4..c50e1f6fb2 100755 --- 
a/cmake/common.cmake +++ b/cmake/common.cmake @@ -284,22 +284,9 @@ function(nbl_install_dir _DIR) endfunction() function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION) - cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN}) - if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}") - endif() - - if(_NBL_INSTALL_LIB_EXPORT) - install(TARGETS ${_TARGETS} - EXPORT ${_NBL_INSTALL_LIB_EXPORT} - ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION} - COMPONENT Libraries - ) - else() - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) - endif() + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) endfunction() function(nbl_install_lib _TARGETS) diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 814c807a84..7c231fea4b 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -118,6 +118,7 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); + template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... 
args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index d409c774e1..daf3dcb4d1 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -161,8 +161,8 @@ struct SPTXResult nvrtcResult result; }; -// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla. -// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types. +// Opt-in native CUDA API. The declarations below are implemented by the Nabla library. +// This header is intentionally the only public path that includes CUDA SDK types. class NBL_API2 CCUDAHandlerAccessor { public: diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index bc1931e363..7c1d1f272b 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -89,8 +89,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - static constexpr auto CudaMemoryLocationCount = 5; - const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index f6b5d578a8..bb2d12c637 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -33,12 +33,17 @@ inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.js struct SRuntimeCompileEnvironment { core::vector includeDirs; - core::vector runtimePathFiles; }; NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector 
explicitIncludeDirs, core::vector runtimePathFiles); -NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment); +inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} } class NBL_API2 CCUDAHandler : public core::IReferenceCounted @@ -73,7 +78,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted friend struct cuda_native::SAccess; struct SNativeState; - CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); std::unique_ptr m_native; core::vector m_availableDevices; @@ -82,7 +87,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::vector m_headerNamesStorage; core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; - int m_version; }; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 7d350da379..fb9896e30e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -93,11 +93,12 @@ Smoke examples: - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- Runtime include-option construction is header-only and is not part of the exported ABI. 
- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. ## Runtime Header Discovery -NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement. - `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. - The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. @@ -105,7 +106,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. +- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. 
Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. @@ -114,7 +115,11 @@ Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows red - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a -Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`. +Attachment A lists header groups relevant to NVRTC runtime compilation: + +- NVIDIA Runtime Compilation Library and Header: `nvrtc.h` +- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` +- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h` CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index ace1059215..5d35ec8bed 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -53,11 +53,13 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) return false; CUcontext poppedContext = nullptr; + bool contextPushed = false; auto releaseContext = [&]() { if (context) { - cuda.pcuCtxPopCurrent_v2(&poppedContext); + if (contextPushed) + cuda.pcuCtxPopCurrent_v2(&poppedContext); cuda.pcuDevicePrimaryCtxRelease_v2(device); } }; @@ -67,6 +69,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) releaseContext(); return false; } + contextPushed = true; constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; std::array output = {}; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9db99e7642..22ed5d0eb3 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,7 +3,6 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CUDAInterop.h" -#include "nbl/system/ModuleLookupUtils.h" #include "nlohmann/json.hpp" @@ -253,11 +252,10 @@ void appendSystemIncludeDirs(core::vector& includeDirs) SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) { SRuntimeCompileEnvironment environment; - environment.runtimePathFiles = std::move(runtimePathFiles); for (auto& includeDir : explicitIncludeDirs) appendIncludeDir(environment.includeDirs,std::move(includeDir)); - appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles); + appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles); appendAppLocalIncludeDirs(environment.includeDirs); 
appendEnvironmentIncludeDirs(environment.includeDirs); appendSystemIncludeDirs(environment.includeDirs); @@ -270,14 +268,6 @@ SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) -{ - core::vector options; - for (const auto& includeDir : environment.includeDirs) - options.push_back("-I" + includeDir.generic_string()); - return options; -} - } #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -307,12 +297,10 @@ int cudaVersionMinor(int version) CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) + core::smart_refctd_ptr&& _logger) : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { assert(m_native); @@ -866,7 +854,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste } return core::smart_refctd_ptr( - new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion), + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)), core::dont_grab ); } @@ -1097,12 +1085,10 @@ struct CCUDAHandler::SNativeState {}; CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) + core::smart_refctd_ptr&& _logger) : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { assert(m_native); } From 5bf0e2d9c70280851f6779ce6a25b853f1730829 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:24:10 +0200 Subject: [PATCH 25/51] Keep CUDA SDK layouts private --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 1 - src/nbl/ext/CUDAInterop/README.md | 7 +++++-- src/nbl/video/CCUDAHandler.cpp | 18 +++++++++--------- src/nbl/video/CUDAInteropNativeState.hpp | 7 +++++++ 4 files 
changed, 21 insertions(+), 12 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index daf3dcb4d1..6d142c6b3f 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -145,7 +145,6 @@ struct SCUDADeviceInfo { CUdevice handle = {}; CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; struct SExportableMemoryCreationParams diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index fb9896e30e..d60b15639a 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -44,6 +44,8 @@ Consumers can also choose the SDK used for native compilation with: cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= ``` +This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. + ## Native Usage ```cpp @@ -92,9 +94,10 @@ Smoke examples: - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. -- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs. +- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. 
A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. - Runtime include-option construction is header-only and is not part of the exported ABI. -- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. +- The loaded CUDA driver and NVRTC runtime are validated at runtime. ## Runtime Header Discovery diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 22ed5d0eb3..78434d9bd5 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -325,15 +325,15 @@ CCUDAHandler::CCUDAHandler( if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - auto& nativeDevice = m_native->availableDevices.emplace_back(); - nativeDevice.handle = handle; - nativeDevice.uuid = uuid; + auto& nativeDevice = m_native->deviceStates.emplace_back(); + nativeDevice.info.handle = handle; + nativeDevice.info.uuid = uuid; + m_native->availableDevices.push_back(nativeDevice.info); auto& cleanDevice = m_availableDevices.emplace_back(); memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = nativeDevice.attributes; - for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + for (size_t i = 0; i < nativeDevice.attributes.size(); i++) + m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast(i), handle); } } @@ -979,9 +979,9 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_native->availableDevices) + for (const auto& device : m_native->deviceStates) { - if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if 
(!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; @@ -1064,7 +1064,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct continue; return core::smart_refctd_ptr( - new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.info.handle),core::smart_refctd_ptr(this)), core::dont_grab ); } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 7e602bb0f3..4be8178aa2 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -10,9 +10,16 @@ namespace nbl::video struct CCUDAHandler::SNativeState { + struct SDeviceState + { + cuda_native::SCUDADeviceInfo info = {}; + std::array attributes = {}; + }; + cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; core::vector availableDevices; + core::vector deviceStates; SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) : cuda(std::move(_cuda)) From d745421cc25114adf5664fb778e873db8e8f5c7a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:51:35 +0200 Subject: [PATCH 26/51] Simplify CUDA interop helper --- cmake/NablaCUDAInteropHelpers.cmake | 190 +++------------------------- src/nbl/ext/CUDAInterop/README.md | 9 +- 2 files changed, 26 insertions(+), 173 deletions(-) diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake index 9c1ac657d4..e84b2d1a8e 100644 --- a/cmake/NablaCUDAInteropHelpers.cmake +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -1,182 +1,28 @@ -function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS) - set(_include_dirs ${ARGN}) - - if(DEFINED CUDAToolkit_INCLUDE_DIRS AND 
NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "") - list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS}) +function(nbl_target_link_cuda_interop TARGET_NAME SCOPE) + if(NOT SCOPE MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(SCOPE PRIVATE) endif() - - if(TARGET CUDA::toolkit) - get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES) - if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND") - list(APPEND _include_dirs ${_cuda_toolkit_include_dirs}) - endif() - endif() - - if(_include_dirs) - list(REMOVE_DUPLICATES _include_dirs) - endif() - - set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE) -endfunction() - -function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) - set(_include_dirs ${ARGN}) - set(_cuda_runtime_include_dir_entries "") - - foreach(_include_dir IN LISTS _include_dirs) - if("${_include_dir}" STREQUAL "") - continue() + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop) + set(_include_dir_entries "") + foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) + if(_include_dir) + file(TO_CMAKE_PATH "${_include_dir}" _include_dir) + list(APPEND _include_dir_entries " \"${_include_dir}\"") endif() - - file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) - string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}") - - list(APPEND _cuda_runtime_include_dir_entries " \"${_include_dir_json}\"") endforeach() - - set(_json_entry_separator [=[ -, -]=]) - list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs) - - set(_json [=[ + list(JOIN _include_dir_entries "," _include_dirs_json) + set(_runtime_json [=[ { "cudaRuntimeIncludeDirs": [ -@_cuda_runtime_include_dirs@ +@_include_dirs_json@ ] } ]=]) - string(CONFIGURE "${_json}" _json @ONLY) - set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) 
-endfunction() - -function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS) - if(CMAKE_CONFIGURATION_TYPES) - set(_configs ${CMAKE_CONFIGURATION_TYPES}) - elseif(CMAKE_BUILD_TYPE) - set(_configs "${CMAKE_BUILD_TYPE}") - else() - set(_configs Debug) - endif() - - list(REMOVE_DUPLICATES _configs) - set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE) -endfunction() - -function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT) - _nbl_cuda_interop_collect_configs(_configs) - set(_runtime_jsons "") - - if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "") - foreach(_config IN LISTS _configs) - set(_runtime_paths_json "${_OVERRIDE_OUTPUT}") - string(REPLACE "$" "${_config}" _runtime_paths_json "${_runtime_paths_json}") - if(_runtime_paths_json MATCHES "\\$<") - message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $.") - endif() - cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs) - if(NOT _is_abs) - cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json) - endif() - cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json) - list(APPEND _runtime_jsons "${_runtime_paths_json}") - endforeach() - list(REMOVE_DUPLICATES _runtime_jsons) - set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) - return() - endif() - - foreach(_config IN LISTS _configs) - string(TOUPPER "${_config}" _config_upper) - get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}") - - if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") - get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY) - endif() - if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}) - set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}") - endif() - if((NOT _runtime_output_dir OR _runtime_output_dir 
STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) - set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") - endif() - if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") - if(CMAKE_CONFIGURATION_TYPES) - set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}") - else() - set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}") - endif() - endif() - - string(REPLACE "$" "${_config}" _runtime_output_dir "${_runtime_output_dir}") - if(_runtime_output_dir MATCHES "\\$<") - message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $.") - endif() - - cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs) - if(NOT _is_abs) - cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir) - endif() - cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir) - - list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json") - endforeach() - - list(REMOVE_DUPLICATES _runtime_jsons) - set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) -endfunction() - -function(nbl_configure_cuda_interop_runtime TARGET_NAME) - cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) - - if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") - endif() - - if(NOT TARGET "${TARGET_NAME}") - message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") - endif() - - _nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}) - - _nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs}) - _nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") - - foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons) - 
file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}") - endforeach() - - set_source_files_properties(${_runtime_paths_jsons} PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) - target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons}) -endfunction() - -function(nbl_target_link_cuda_interop TARGET_NAME) - set(_args ${ARGN}) - set(_scope PRIVATE) - - if(_args) - list(GET _args 0 _first_arg) - if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") - set(_scope "${_first_arg}") - list(REMOVE_AT _args 0) - endif() - endif() - - cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args}) - - if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") - endif() - - if(NOT TARGET "${TARGET_NAME}") - message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") - endif() - if(NOT TARGET Nabla::ext::CUDAInterop) - message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. 
Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.") + string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY) + set(_runtime_json_path "$/nbl_cuda_interop_runtime.json") + if(_NBL_CUDA_INTEROP_RUNTIME_JSON) + set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") endif() - - target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop) - nbl_configure_cuda_interop_runtime("${TARGET_NAME}" - RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}" - INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS} - ) + file(GENERATE OUTPUT "${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}") endfunction() diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index d60b15639a..2ce46cbc93 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -113,11 +113,15 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. 
An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code. + +NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a +This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package. + Attachment A lists header groups relevant to NVRTC runtime compilation: - NVIDIA Runtime Compilation Library and Header: `nvrtc.h` @@ -144,3 +148,6 @@ The split follows the same boundary pattern used by mature GPU projects: default - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 - Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 +- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that 
includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52 +- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58 +- ONNX Runtime keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150 From ffba3d48d4ac5fd7f26ed324c310f338328572af Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 18:12:36 +0200 Subject: [PATCH 27/51] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 3c57a88af9..7b5817a6d4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca +Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5 From 745f1b9166c457f538c67587cc9965280338884d Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Fri, 8 May 2026 17:07:54 +0200 Subject: [PATCH 28/51] Use opaque CUDA interop boundary --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 114 ++++++++++++------ include/nbl/video/CCUDADevice.h | 13 ++ include/nbl/video/CCUDAExportableMemory.h | 2 + include/nbl/video/CCUDAHandler.h | 1 + include/nbl/video/CCUDAImportedMemory.h | 3 + include/nbl/video/CCUDAImportedSemaphore.h | 2 + include/nbl/video/CUDAInterop.h | 1 + 
include/nbl/video/CUDAInteropHandles.h | 40 ++++++ src/nbl/ext/CUDAInterop/README.md | 49 ++++---- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 22 ++-- src/nbl/video/CCUDADevice.cpp | 99 ++++++++------- src/nbl/video/CCUDAExportableMemory.cpp | 19 +-- src/nbl/video/CCUDAHandler.cpp | 42 ++++--- src/nbl/video/CCUDAImportedMemory.cpp | 39 ++++-- src/nbl/video/CCUDAImportedSemaphore.cpp | 18 +-- 16 files changed, 309 insertions(+), 157 deletions(-) create mode 100644 include/nbl/video/CUDAInteropHandles.h diff --git a/examples_tests b/examples_tests index 7b5817a6d4..2d415af102 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5 +Subproject commit 2d415af102ebf710ea2bb369b3f0eca5544652f7 diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 6d142c6b3f..495f3cabc0 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -13,6 +13,7 @@ #include "cuda.h" #include "nvrtc.h" +#include #if CUDA_VERSION < 13000 #error "Need CUDA 13.0 SDK or higher." #endif @@ -160,54 +161,91 @@ struct SPTXResult nvrtcResult result; }; -// Opt-in native CUDA API. The declarations below are implemented by the Nabla library. -// This header is intentionally the only public path that includes CUDA SDK types. 
-class NBL_API2 CCUDAHandlerAccessor +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +template +struct SOpaqueCUDAType; + +template<> struct SOpaqueCUDAType { using type = CUdevice; }; +template<> struct SOpaqueCUDAType { using type = CUcontext; }; +template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; +template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; +template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; + +template +struct SNativeHandle { - public: - static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); - static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); - static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); - static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - static const core::vector& getAvailableDevices(const CCUDAHandler& handler); - static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); - static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); - static SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr - ); + using cuda_t = typename SOpaqueCUDAType::type; + static_assert(cuda_opaque_handle); + + 
SNativeHandle() = default; + SNativeHandle(const SNativeHandle&) = default; + SNativeHandle(const cuda_t& native) { operator=(native); } + SNativeHandle(const Opaque& opaque) { operator=(opaque); } + + SNativeHandle& operator=(const SNativeHandle&) = default; + SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } + SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + + operator cuda_t&() { return value; } + operator const cuda_t&() const { return value; } + operator Opaque&() { return reinterpret_cast(value); } + operator const Opaque&() const { return reinterpret_cast(value); } + + Opaque* opaque() { return &static_cast(*this); } + const Opaque* opaque() const { return &static_cast(*this); } + Opaque asOpaque() const { return static_cast(*this); } + + cuda_t value = {}; }; -class NBL_API2 CCUDADeviceAccessor +using SCUdevice = SNativeHandle; +using SCUcontext = SNativeHandle; +using SCUdeviceptr = SNativeHandle; +using SCUexternalMemory = SNativeHandle; +using SCUexternalSemaphore = SNativeHandle; + +inline bool isBuildCUDAVersionCompatible() { - public: - static CUdevice getInternalObject(const CCUDADevice& device); - static CUcontext getContext(const CCUDADevice& device); - static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); - static core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); -}; + const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); + return buildVersion==0u || buildVersion==CUDA_VERSION; +} -class NBL_API2 CCUDAExportableMemoryAccessor +inline bool isDeviceLocal(CUmemLocationType location) { - public: - static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -}; + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + +// Opt-in native CUDA declarations. Nabla owns the definitions. 
+NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 SPTXResult compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr +); -class NBL_API2 CCUDAImportedMemoryAccessor +inline size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) { - public: - static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); - static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -}; + return device.roundToGranularity(static_cast(location),size); +} -class NBL_API2 CCUDAImportedSemaphoreAccessor +inline core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params) { - public: - static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); -}; + return 
device.createExportableMemory({params.size,params.alignment,static_cast(params.location)}); +} } diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7c1d1f272b..d4eb711cd2 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" @@ -75,9 +76,21 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted } const CCUDAHandler* getHandler() const { return m_handler.get(); } + cuda_interop::SCUdevice getInternalObject() const; + cuda_interop::SCUcontext getContext() const; + + struct SExportableMemoryCreationParams + { + size_t size; + uint32_t alignment; + uint32_t locationType; + }; + + size_t roundToGranularity(uint32_t locationType, size_t size) const; bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } + core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 6d29739408..6243bd8c73 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -32,6 +33,7 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted ~CCUDAExportableMemory() override; + cuda_interop::SCUdeviceptr getDeviceptr() const; core::smart_refctd_ptr 
exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index bb2d12c637..2ce6541696 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -50,6 +50,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static uint32_t getBuildCUDAVersion(); inline core::SRange getSTDHeaders() { diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index ac41c110a2..454088b7ae 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -2,6 +2,7 @@ #define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -20,6 +21,8 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: ~CCUDAImportedMemory() override; + cuda_interop::SCUexternalMemory getInternalObject() const; + bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index c8bf77313e..5a4f28abde 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -23,6 +24,7 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: ~CCUDAImportedSemaphore() override; + cuda_interop::SCUexternalSemaphore getInternalObject() const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h index 57e92ae647..efea886b96 100644 --- 
a/include/nbl/video/CUDAInterop.h +++ b/include/nbl/video/CUDAInterop.h @@ -4,6 +4,7 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#include "nbl/video/CUDAInteropHandles.h" #include "nbl/video/CCUDADevice.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAHandler.h" diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h new file mode 100644 index 0000000000..88401f6a1f --- /dev/null +++ b/include/nbl/video/CUDAInteropHandles.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ + +#include +#include + +namespace nbl::video::cuda_interop +{ + +struct alignas(alignof(int32_t)) SCUdevice +{ + uint8_t value[sizeof(int32_t)] = {}; +}; + +struct alignas(alignof(void*)) SCUcontext +{ + uint8_t value[sizeof(void*)] = {}; +}; + +struct alignas(alignof(uintptr_t)) SCUdeviceptr +{ + uint8_t value[sizeof(uintptr_t)] = {}; +}; + +struct alignas(alignof(void*)) SCUexternalMemory +{ + uint8_t value[sizeof(void*)] = {}; +}; + +struct alignas(alignof(void*)) SCUexternalSemaphore +{ + uint8_t value[sizeof(void*)] = {}; +}; + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 2ce46cbc93..fca288a98f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -2,10 +2,11 @@ ## Layout -- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. -- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. 
-- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types. +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. +- The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. +- CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the native opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA native types with `cuda_native::SNativeHandle`. 
## CMake Usage @@ -54,33 +55,39 @@ This affects native opt-in compilation and generated runtime header discovery on auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { +if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) + return false; + +auto memory = nbl::video::cuda_native::createExportableMemory(*cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); +nbl::video::cuda_native::SCUdeviceptr mapped; +if (importedMemory) + importedMemory->getMappedBuffer(mapped.opaque()); + +CUdeviceptr rawMapped = mapped; +CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); + std::string log; -std::string cudaSource = loadKernelText(); -auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( +auto compile = nbl::video::cuda_native::compileDirectlyToPTX( *handler, std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), - log, - 0, - nullptr, - nullptr + log ); ``` -Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes: +Native access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: -- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling. -- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. -- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. -- Accessor methods take explicit Nabla references. 
Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. -- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API. +- `cuda_native::getCUDAFunctionTable` and `cuda_native::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables. +- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA native handles such as `CUdeviceptr`. +- `cuda_native::createExportableMemory` and `cuda_native::roundToGranularity` keep CUDA enum usage in the opt-in header while Nabla stores only integer/opaque data in its public ABI. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. Native opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. +- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. Smoke examples: @@ -92,10 +99,10 @@ Smoke examples: - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA native handle storage. The native opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. 
-- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs. -- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. +- A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. Leaky/native code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. @@ -109,7 +116,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. 
Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 5d35ec8bed..52fdb43539 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,7 +23,7 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, { + auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, .location = CU_MEM_LOCATION_TYPE_DEVICE, @@ -35,18 +35,20 @@ using namespace nbl::video; auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); - CUdeviceptr mappedVulkanMemory = 0; + cuda_native::SCUdeviceptr mappedVulkanMemory; if (importedFromVulkan) - cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); + importedFromVulkan->getMappedBuffer(mappedVulkanMemory.opaque()); - const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); + CUexternalSemaphore cudaSemaphore = nullptr; + if (importedSemaphore) + cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler); + auto& cuda = cuda_native::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -98,7 +100,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + auto compile = cuda_native::compileDirectlyToPTX( handler, std::string(Source), "cuda_fp16_discovery_probe.cu", @@ -124,7 +126,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v())), CUdevice>); + static_assert(std::is_same_v); + if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) + return false; #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -147,7 +151,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler); + const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); if (devices.empty()) return true; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 359cd093a1..2ed02a6282 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -34,11 +34,11 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) assert(false); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) assert(false); for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) @@ -58,32 +58,31 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) assert(false); } } -namespace cuda_native +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - -CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device) -{ - return SAccess::native(device).handle; + return cuda_native::SCUdevice(cuda_native::SAccess::native(*this).handle).asOpaque(); } -CUcontext CCUDADeviceAccessor::getContext(const CCUDADevice& device) +cuda_interop::SCUcontext CCUDADevice::getContext() const { - return SAccess::native(device).context; + return 
cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); } -size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +size_t CCUDADevice::roundToGranularity(uint32_t locationType, size_t size) const { - const auto& granularity = SAccess::native(device).allocationGranularity[location]; + if (locationType>=m_native->allocationGranularity.size()) + return 0u; + const auto& granularity = m_native->allocationGranularity[locationType]; + if (granularity==0u) + return 0u; return ((size - 1) / granularity + 1) * granularity; } -} - static bool isDeviceLocal(CUmemLocationType location) { return location==CU_MEM_LOCATION_TYPE_DEVICE; @@ -93,7 +92,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); + const auto& cu = cuda_native::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -101,7 +100,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) assert(false); return err; } @@ -113,9 +112,9 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) assert(false); - if 
(!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) assert(false); return err; } @@ -125,23 +124,23 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept return CUDA_SUCCESS; } -namespace cuda_native -{ - -core::smart_refctd_ptr CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { - const auto handler = device.getHandler(); - auto& native = SAccess::native(device); - auto logger = SAccess::logger(device); + const auto handler = getHandler(); + auto& native = cuda_native::SAccess::native(*this); + auto logger = cuda_native::SAccess::logger(*this); + const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { .size = inParams.size, .alignment = inParams.alignment, - .granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size), - .deviceLocal = isDeviceLocal(inParams.location) + .granularSize = roundToGranularity(inParams.locationType, inParams.size), + .deviceLocal = isDeviceLocal(location) }; + if (params.granularSize==0u) + return nullptr; - auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); + auto& cu = cuda_native::getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -151,14 +150,14 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = SAccess::allocationHandleType(), - .location = { .type = inParams.location, .id = native.handle }, + .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), + .location = { .type = location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, 
#endif }; - auto nativeState = SAccess::makeExportableMemoryNativeState(); + auto nativeState = cuda_native::SAccess::makeExportableMemoryNativeState(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -170,16 +169,16 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) assert(false); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*this,&cuda_native::SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) assert(false); bool closeSucceed = CloseExternalHandle(params.externalHandle); @@ -195,14 +194,12 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta return nullptr; } - return SAccess::makeExportableMemory(core::smart_refctd_ptr(&device),std::move(params),std::move(nativeState)); -} - + return cuda_native::SAccess::makeExportableMemory(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + const auto& cu = 
cuda_native::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -233,7 +230,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -266,7 +263,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) assert(false); } @@ -298,6 +295,26 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const +{ + return {}; +} + +cuda_interop::SCUcontext CCUDADevice::getContext() const +{ + return {}; +} + +size_t CCUDADevice::roundToGranularity(uint32_t, size_t) const +{ + return 0u; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) +{ + return nullptr; +} + core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index f84169e38f..6c77736628 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,12 +52,12 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); 
+ const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) assert(false); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) assert(false); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); @@ -65,15 +65,11 @@ CCUDAExportableMemory::~CCUDAExportableMemory() } -namespace cuda_native +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - -CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory) -{ - return SAccess::native(memory).ptr; + return cuda_native::SCUdeviceptr(m_native->ptr).asOpaque(); } -} } #else @@ -102,6 +98,11 @@ core::smart_refctd_ptr CCUDAExportableMemory::create(core CCUDAExportableMemory::~CCUDAExportableMemory() = default; +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return {}; +} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const { return nullptr; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 78434d9bd5..4168612e61 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -340,10 +340,15 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; +uint32_t CCUDAHandler::getBuildCUDAVersion() +{ + return CUDA_VERSION; +} + namespace cuda_native { -bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& 
logger) { switch (result) { @@ -709,12 +714,12 @@ bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::lo return false; } -bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler)); + return defaultHandleResult(result,SAccess::logger(handler)); } -bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch (result) { @@ -862,22 +867,22 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler) +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).cuda; } -const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler) +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).nvrtc; } -const core::vector& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler) +const core::vector& getAvailableDevices(const CCUDAHandler& handler) { return SAccess::native(handler).availableDevices; } -nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -889,12 +894,12 @@ nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProg 
return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } -nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) { return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); @@ -907,7 +912,7 @@ nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvr return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -945,15 +950,15 @@ static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult r const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); - result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); - CCUDAHandlerAccessor::getProgramLog(handler,program,log); + result = compileProgram(handler,program,{optionsBegin,optionsEnd}); + getProgramLog(handler,program,log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - return CCUDAHandlerAccessor::getPTX(handler,program); + return getPTX(handler,program); } -SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( +SPTXResult compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { @@ -965,7 +970,7 @@ SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); - result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); + result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); } @@ -1095,6 +1100,11 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; +uint32_t CCUDAHandler::getBuildCUDAVersion() +{ + return 0u; +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 9145fe18ac..404323a365 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -18,31 +18,34 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev assert(m_native); } -namespace cuda_native +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - -CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory) -{ - 
return SAccess::native(memory).handle; + return cuda_native::SCUexternalMemory(m_native->handle).asOpaque(); } -CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const { + if (!mappedBuffer) + return false; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; - bufferDesc.size = SAccess::source(memory)->getAllocationSize(); + bufferDesc.size = m_src->getAllocationSize(); - const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + return false; -} - + *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer).asOpaque(); + return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) assert(false); } @@ -66,6 +69,16 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev CCUDAImportedMemory::~CCUDAImportedMemory() = default; +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return {}; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr*) const +{ + return false; +} + } 
#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 5d7d3e07ae..4fce78788c 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -17,20 +17,15 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle).asOpaque(); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) assert(false); } } @@ -53,6 +48,11 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr Date: Sat, 9 May 2026 13:22:30 +0200 Subject: [PATCH 29/51] Clean CUDA interop boundary --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 84 ++++++++++++------- include/nbl/video/CCUDADevice.h | 29 +------ include/nbl/video/CCUDAHandler.h | 4 + include/nbl/video/CUDAInteropHandles.h | 37 ++++---- src/nbl/ext/CUDAInterop/README.md | 48 ++++++----- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 16 ++-- src/nbl/video/CCUDADevice.cpp | 60 +++++++++++-- src/nbl/video/CCUDAExportableMemory.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 35 ++++---- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- 12 files changed, 188 insertions(+), 135 deletions(-) diff --git a/examples_tests b/examples_tests index 2d415af102..e289ee14f5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2d415af102ebf710ea2bb369b3f0eca5544652f7 +Subproject commit e289ee14f5b8f05004726e6f03c81a9a2e768219 diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h 
b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 495f3cabc0..77d248dee2 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -1,16 +1,45 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +/* + CUDA SDK opt-in boundary for Nabla CUDA interop. + + Public nbl/video CUDA interop headers expose SDK-free cuda_interop::SCU* opaque handles. This header is the + explicit boundary where a consumer accepts CUDA/NVRTC SDK headers, raw CU* types, and Nabla helper APIs whose + signatures use CUDA SDK types. This happens by linking Nabla::ext::CUDAInterop and including this file, which + includes cuda.h and nvrtc.h. The CUDA SDK becomes a compile-time requirement only for that SDK opt-in + consumer. + + The exported definitions stay in Nabla because they are glue between the Nabla world and the CUDA world: + dynamic Driver API/NVRTC loader access, NVRTC program helpers, error handling, runtime header discovery, and + CUDA/Vulkan resource interop lifetime. This header only exposes the CUDA-typed signatures for that glue after + the consumer explicitly opts in. Nabla::ext::CUDAInterop is the build-system edge for this SDK-typed surface. + It is not a separate owner of these definitions. Code that only consumes Nabla::Nabla does not need CUDA SDK + headers and does not parse CUDA/NVRTC declarations. + + Keeping SDK-defined types out of Nabla's public ABI is intentional. CUDA headers have changed observable + compile-time constants across SDK versions: + - CUDA Toolkit 9.0 documented CU_CTX_FLAGS_MASK as 0x1f. CUDA 12.1, 12.5, and 13.2 define it as 0xff. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS as 93. CUDA 12.1, 12.5, + and 13.2 keep 93 as CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 and define the unsuffixed name + as 122. 
+ - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR as 94. CUDA 12.1, 12.5, + and 13.2 keep 94 as CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 and define the unsuffixed name + as 123. + + If these SDK declarations leak through public Nabla headers, consumers can silently compile against a + different CUDA interpretation than the one used to build the interop implementation. That is especially + problematic for installed packages, plugins, and separately built downstream projects. The opaque handles + keep Nabla's public ABI independent from CUDA SDK headers. This opt-in header then validates handle + size/alignment against the SDK selected by the SDK opt-in consumer. +*/ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ - +#include #include "nbl/video/CUDAInterop.h" - #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" -#include - #include "cuda.h" #include "nvrtc.h" #include @@ -148,13 +177,6 @@ struct SCUDADeviceInfo CUuuid uuid = {}; }; -struct SExportableMemoryCreationParams -{ - size_t size; - uint32_t alignment; - CUmemLocationType location; -}; - struct SPTXResult { core::smart_refctd_ptr ptx; @@ -167,7 +189,14 @@ concept cuda_opaque_handle = std::is_trivially_copyable_v && sizeof(Opaque)==sizeof(Native) && alignof(Opaque)==alignof(Native); +/* + Map Nabla opaque handles to CUDA SDK handle types. + This is deliberately small. It is not an attempt to wrap CUDA. It only gives SDK opt-in code a convenient + way to pass Nabla-owned opaque handles to CUDA C APIs while checking that the public opaque type has the same + layout as the CUDA type visible in this translation unit. If a future SDK changes one of these handle layouts, + the SDK opt-in build fails here instead of letting ABI drift propagate through packaged Nabla headers. 
+*/ template struct SOpaqueCUDAType; @@ -176,7 +205,13 @@ template<> struct SOpaqueCUDAType { using type = CUcon template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; +/* + CUDA SDK view of an SDK-free opaque handle. + The conversions are intentionally available only after including this header. Public Nabla headers expose + only the opaque SCU* values. Once a consumer opts in, SNativeHandle restores the CUDA spelling and ergonomics + for raw Driver API calls without adding accessors to every interop operation. +*/ template struct SNativeHandle { @@ -215,19 +250,18 @@ inline bool isBuildCUDAVersionCompatible() const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); return buildVersion==0u || buildVersion==CUDA_VERSION; } +/* + Nabla interop API declarations with CUDA SDK signatures. -inline bool isDeviceLocal(CUmemLocationType location) -{ - return location==CU_MEM_LOCATION_TYPE_DEVICE; -} - -// Opt-in native CUDA declarations. Nabla owns the definitions. -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + These declarations belong to the Nabla interop API. They live behind Nabla::ext::CUDAInterop because their + signatures mention CUDA/NVRTC SDK types directly. Keeping them out of nbl/video/CCUDA*.h means Nabla's public + API can be parsed and packaged without CUDA SDK headers. Nabla still owns the exported glue definitions. + Consumers accept this SDK-typed API surface only by including this header and linking the explicit interop + target. 
+*/ NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -237,16 +271,6 @@ NBL_API2 SPTXResult compileDirectlyToPTX( std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); -inline size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) -{ - return device.roundToGranularity(static_cast(location),size); -} - -inline core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params) -{ - return device.createExportableMemory({params.size,params.alignment,static_cast(params.location)}); -} - } #endif diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index d4eb711cd2..0f8bd015ed 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -10,7 +10,6 @@ #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" -#include #include #include @@ -50,32 +49,13 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted EVA_80, EVA_COUNT }; - static inline constexpr const char* virtualArchCompileOption[] = { - "-arch=compute_30", - "-arch=compute_32", - "-arch=compute_35", - "-arch=compute_37", - "-arch=compute_50", - 
"-arch=compute_52", - "-arch=compute_53", - "-arch=compute_60", - "-arch=compute_61", - "-arch=compute_62", - "-arch=compute_70", - "-arch=compute_72", - "-arch=compute_75", - "-arch=compute_80" - }; - inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() const; ~CCUDADevice() override; - inline core::SRange geDefaultCompileOptions() const - { - return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; - } + core::SRange geDefaultCompileOptions() const; - const CCUDAHandler* getHandler() const { return m_handler.get(); } + const CCUDAHandler* getHandler() const; cuda_interop::SCUdevice getInternalObject() const; cuda_interop::SCUcontext getContext() const; @@ -88,8 +68,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(uint32_t locationType, size_t size) const; - bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); @@ -105,7 +83,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 2ce6541696..db30b08587 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -23,6 +23,8 @@ class IPhysicalDevice; namespace cuda_native { +class CUDA; +class NVRTC; struct SAccess; } @@ -51,6 +53,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted public: static 
core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDAVersion(); + const cuda_native::CUDA& getCUDAFunctionTable() const; + const cuda_native::NVRTC& getNVRTCFunctionTable() const; inline core::SRange getSTDHeaders() { diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 88401f6a1f..741a04c319 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -10,30 +10,25 @@ namespace nbl::video::cuda_interop { -struct alignas(alignof(int32_t)) SCUdevice +/* + SDK-free CUDA handle surrogates used by Nabla's public video API. + + These types are the small glue layer between Nabla and SDK-typed CUDA interop code. They let nbl/video/CCUDA*.h + expose CUDA-related objects without including cuda.h or nvrtc.h, so consumers that only link Nabla::Nabla do + not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back + to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. 
+*/ +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle { - uint8_t value[sizeof(int32_t)] = {}; + uint8_t value[sizeof(Storage)] = {}; }; -struct alignas(alignof(void*)) SCUcontext -{ - uint8_t value[sizeof(void*)] = {}; -}; - -struct alignas(alignof(uintptr_t)) SCUdeviceptr -{ - uint8_t value[sizeof(uintptr_t)] = {}; -}; - -struct alignas(alignof(void*)) SCUexternalMemory -{ - uint8_t value[sizeof(void*)] = {}; -}; - -struct alignas(alignof(void*)) SCUexternalSemaphore -{ - uint8_t value[sizeof(void*)] = {}; -}; +struct SCUdevice : SOpaqueCUDAHandle {}; +struct SCUcontext : SOpaqueCUDAHandle {}; +struct SCUdeviceptr : SOpaqueCUDAHandle {}; +struct SCUexternalMemory : SOpaqueCUDAHandle {}; +struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index fca288a98f..a55bafbb9f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -5,19 +5,19 @@ - `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. -- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the native opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA native types with `cuda_native::SNativeHandle`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. 
It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA SDK types with `cuda_native::SNativeHandle`. ## CMake Usage -Default Nabla usage stays SDK-free: +`Nabla::Nabla`-only usage stays SDK-free: ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` -Native CUDA interop is explicit: +SDK-typed CUDA interop is explicit: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) @@ -39,15 +39,15 @@ nbl_target_link_cuda_interop(native_app PRIVATE ) ``` -Consumers can also choose the SDK used for native compilation with: +Consumers can also choose the SDK used for SDK-typed compilation with: ```cmake cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= ``` -This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. +This affects SDK opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. 
-## Native Usage +## SDK Opt-In Usage ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" @@ -58,10 +58,10 @@ auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDev if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) return false; -auto memory = nbl::video::cuda_native::createExportableMemory(*cudaDevice, { +auto memory = cudaDevice->createExportableMemory({ .size = size, .alignment = alignment, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); nbl::video::cuda_native::SCUdeviceptr mapped; @@ -70,6 +70,8 @@ if (importedMemory) CUdeviceptr rawMapped = mapped; CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); std::string log; auto compile = nbl::video::cuda_native::compileDirectlyToPTX( @@ -81,46 +83,46 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -Native access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: +SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: -- `cuda_native::getCUDAFunctionTable` and `cuda_native::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables. -- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA native handles such as `CUdeviceptr`. -- `cuda_native::createExportableMemory` and `cuda_native::roundToGranularity` keep CUDA enum usage in the opt-in header while Nabla stores only integer/opaque data in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. Native opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. 
+- `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. +- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. +- CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. -- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in. -- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. +- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. ## ABI - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. -- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA native handle storage. The native opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
+- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. -- A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. Leaky/native code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. ## Runtime Header Discovery -NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a `Nabla::Nabla` package requirement. -- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into SDK-typed CUDA interop. - The JSON is a build artifact. 
Nabla packages do not install host-specific CUDA paths. - Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code. +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. 
Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or SDK-typed CUDA code. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: @@ -149,7 +151,7 @@ CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On ## Related Designs -The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API. +The split follows the same boundary pattern used by mature GPU projects: public/common headers avoid vendor SDK requirements, vendor SDK access is explicit, and implementation details stay outside the public API. 
- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 52fdb43539..f760b78a1c 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,10 +23,10 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { + auto cudaMemory = cudaDevice.createExportableMemory({ .size = 4096, .alignment = 4096, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); if (!cudaMemory) return false; @@ -48,7 +48,7 @@ using namespace nbl::video; bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::getCUDAFunctionTable(handler); + auto& cuda = handler.getCUDAFunctionTable(); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -151,11 +151,15 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); - if (devices.empty()) + int deviceCount = 0; + if (handler->getCUDAFunctionTable().pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount==0) return true; - return cudaDriverRoundtrip(*handler, devices.front().handle); + 
CUdevice device = {}; + if (handler->getCUDAFunctionTable().pcuDeviceGet(&device,0)!=CUDA_SUCCESS) + return false; + + return cudaDriverRoundtrip(*handler, device); } void workLoopBody() override {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 2ed02a6282..426b900f4d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,6 +3,26 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CUDAInterop.h" +namespace nbl::video +{ + +CCUDADevice::E_VIRTUAL_ARCHITECTURE CCUDADevice::getVirtualArchitecture() const +{ + return m_virtualArchitecture; +} + +core::SRange CCUDADevice::geDefaultCompileOptions() const +{ + return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; +} + +const CCUDAHandler* CCUDADevice::getHandler() const +{ + return m_handler.get(); +} + +} + #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" @@ -13,6 +33,30 @@ namespace nbl::video { +namespace +{ + +constexpr const char* VirtualArchCompileOption[] = { + "-arch=compute_30", + "-arch=compute_32", + "-arch=compute_35", + "-arch=compute_37", + "-arch=compute_50", + "-arch=compute_52", + "-arch=compute_53", + "-arch=compute_60", + "-arch=compute_61", + "-arch=compute_62", + "-arch=compute_70", + "-arch=compute_72", + "-arch=compute_75", + "-arch=compute_80" +}; + +static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); + +} + CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, @@ -22,7 +66,6 @@ CCUDADevice::CCUDADevice( m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), - m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), m_handler(std::move(handler)), m_native(std::move(nativeState)) @@ -30,11 +73,11 @@ 
CCUDADevice::CCUDADevice( assert(m_native); m_defaultCompileOptions.push_back("--std=c++14"); - m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); + m_defaultCompileOptions.push_back(VirtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = m_handler->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) assert(false); @@ -92,7 +135,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::getCUDAFunctionTable(*handler); + const auto& cu = handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -140,7 +183,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (params.granularSize==0u) return nullptr; - auto& cu = cuda_native::getCUDAFunctionTable(*handler); + auto& cu = handler->getCUDAFunctionTable(); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -199,7 +242,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = m_handler->getCUDAFunctionTable(); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -230,7 +273,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + auto& cu = m_handler->getCUDAFunctionTable(); auto 
handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -263,7 +306,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::defaultHandleResult(*m_handler, cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context))) assert(false); } @@ -285,7 +328,6 @@ CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& handler) : m_logger(nullptr) , m_vulkanConnection(std::move(vulkanConnection)) - , m_physicalDevice(vulkanDevice) , m_virtualArchitecture(virtualArchitecture) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 6c77736628..6458fe5af3 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,7 +52,7 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) assert(false); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 4168612e61..9e40942914 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -345,6 +345,16 @@ uint32_t CCUDAHandler::getBuildCUDAVersion() return CUDA_VERSION; } +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + return cuda_native::SAccess::native(*this).cuda; +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + return cuda_native::SAccess::native(*this).nvrtc; +} + namespace cuda_native { @@ -867,21 +877,6 @@ core::smart_refctd_ptr 
CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) -{ - return SAccess::native(handler).cuda; -} - -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) -{ - return SAccess::native(handler).nvrtc; -} - -const core::vector& getAvailableDevices(const CCUDAHandler& handler) -{ - return SAccess::native(handler).availableDevices; -} - nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) @@ -1105,6 +1100,16 @@ uint32_t CCUDAHandler::getBuildCUDAVersion() return 0u; } +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + std::abort(); +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + std::abort(); +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 404323a365..168fce511e 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -33,7 +33,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff bufferDesc.size = m_src->getAllocationSize(); CUdeviceptr nativeMappedBuffer = 0; - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) return false; @@ -44,7 +44,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + auto& cu = 
m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) assert(false); } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 4fce78788c..3296d16a60 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -24,7 +24,7 @@ cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() c CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) assert(false); } From fce838b8caaa587d5dcde0dae436ad1281221ef8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 14:20:06 +0200 Subject: [PATCH 30/51] Polish CUDA interop cleanup --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 7 +++ include/nbl/video/CCUDADevice.h | 12 ++++- include/nbl/video/EApiType.h | 33 ++++++++++++- src/nbl/CMakeLists.txt | 1 - src/nbl/video/CCUDADevice.cpp | 48 ++++++------------- src/nbl/video/CCUDAExportableMemory.cpp | 10 ++-- src/nbl/video/CCUDAImportedMemory.cpp | 3 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 3 +- src/nbl/video/CUDAInteropNativeState.hpp | 1 - src/nbl/video/EApiType.cpp | 37 -------------- 10 files changed, 70 insertions(+), 85 deletions(-) delete mode 100644 src/nbl/video/EApiType.cpp diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 77d248dee2..2ec4b723c0 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -42,6 +42,7 @@ #include "cuda.h" #include "nvrtc.h" +#include #include #if CUDA_VERSION < 13000 #error "Need CUDA 13.0 SDK or higher." 
@@ -262,6 +263,12 @@ inline bool isBuildCUDAVersionCompatible() NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult(*(handler), nblCudaInteropResult)) \ + assert(false); \ + } while(0) NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 0f8bd015ed..5acfd35831 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -10,6 +10,7 @@ #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" +#include #include #include @@ -66,7 +67,15 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted uint32_t locationType; }; - size_t roundToGranularity(uint32_t locationType, size_t size) const; + inline size_t roundToGranularity(uint32_t locationType, size_t size) const + { + if (locationType>=m_allocationGranularity.size()) + return 0u; + const auto granularity = m_allocationGranularity[locationType]; + if (size==0u || granularity==0u) + return 0u; + return ((size - 1) / granularity + 1) * granularity; + } core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); @@ -83,6 +92,7 @@ 
class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 44a31ecf90..9b1a79e4d4 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -3,6 +3,14 @@ #include "nbl/core/declarations.h" #include +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include +#endif namespace nbl::video { @@ -28,8 +36,29 @@ constexpr external_handle_t ExternalHandleNull = nullptr; constexpr external_handle_t ExternalHandleNull = -1; #endif -NBL_API2 bool CloseExternalHandle(external_handle_t handle); -NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle); +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle)==0; +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index d56c223e34..317cf3d2a1 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -257,7 +257,6 @@ set(NBL_VIDEO_SOURCES video/IGPUAccelerationStructure.cpp video/IGPUCommandBuffer.cpp video/IQueue.cpp - video/EApiType.cpp video/IGPUDescriptorSet.cpp video/IDeviceMemoryAllocation.cpp video/IDeviceMemoryBacked.cpp diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 426b900f4d..802e224793 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -84,7 +84,7 @@ CCUDADevice::CCUDADevice( if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) assert(false); - for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { #ifdef _WIN32 @@ -101,7 +101,7 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) assert(false); } } @@ -116,16 +116,6 @@ cuda_interop::SCUcontext CCUDADevice::getContext() const return cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); } -size_t CCUDADevice::roundToGranularity(uint32_t locationType, size_t size) const -{ - if (locationType>=m_native->allocationGranularity.size()) - return 0u; - const auto& granularity = m_native->allocationGranularity[locationType]; - if (granularity==0u) - return 0u; - return ((size - 1) / granularity + 1) * granularity; -} - static bool isDeviceLocal(CUmemLocationType location) { return location==CU_MEM_LOCATION_TYPE_DEVICE; @@ -143,8 +133,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -155,10 +144,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = 
cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) - assert(false); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -212,8 +199,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); return nullptr; } @@ -221,19 +207,21 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + if (!CloseExternalHandle(params.externalHandle)) + logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + cuda_native::defaultHandleResult(*handler, err); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + if (!CloseExternalHandle(params.externalHandle)) + 
logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } @@ -306,8 +294,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context))) - assert(false); + cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); } } @@ -347,11 +334,6 @@ cuda_interop::SCUcontext CCUDADevice::getContext() const return {}; } -size_t CCUDADevice::roundToGranularity(uint32_t, size_t) const -{ - return 0u; -} - core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) { return nullptr; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 6458fe5af3..2696fe1ebd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -54,14 +54,12 @@ CCUDAExportableMemory::~CCUDAExportableMemory() { const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); - bool closeSucceed = CloseExternalHandle(m_params.externalHandle); - assert(closeSucceed); + if (!CloseExternalHandle(m_params.externalHandle)) + cuda_native::SAccess::logger(*m_device).log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp 
index 168fce511e..cff48931c0 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -45,8 +45,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 3296d16a60..6a43fefc4c 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -25,8 +25,7 @@ cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() c CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle)); } } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 4be8178aa2..743bd10c3e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -31,7 +31,6 @@ struct CCUDADevice::SNativeState { CUdevice handle = {}; CUcontext context = nullptr; - std::array allocationGranularity = {}; explicit SNativeState(CUdevice _handle) : handle(_handle) diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp deleted file mode 100644 index d7eadd8b08..0000000000 --- a/src/nbl/video/EApiType.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include "nbl/video/EApiType.h" - -#ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#else -#include -#endif - -namespace 
nbl::video -{ - -bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return close(handle)==0; -#endif -} - -external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE duplicated = ExternalHandleNull; - - const HANDLE process = GetCurrentProcess(); - if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return duplicated; -#else - return dup(handle); -#endif -} - -} From 9f2d5feae119315e46db89fbf54e4500694cec56 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 16:57:14 +0200 Subject: [PATCH 31/51] Simplify CUDA interop native boundary --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 100 +++++------------- include/nbl/system/DynamicFunctionCaller.h | 14 ++- include/nbl/video/CCUDADevice.h | 6 -- include/nbl/video/CCUDAExportableMemory.h | 7 +- include/nbl/video/CCUDAHandler.h | 6 +- include/nbl/video/CCUDAImportedMemory.h | 7 +- include/nbl/video/CCUDAImportedSemaphore.h | 6 -- include/nbl/video/CUDAInteropHandles.h | 42 ++++++++ src/nbl/ext/CUDAInterop/README.md | 28 +++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 10 +- src/nbl/video/CCUDADevice.cpp | 55 +++++----- src/nbl/video/CCUDAExportableMemory.cpp | 4 +- src/nbl/video/CCUDAHandler.cpp | 34 +++--- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- src/nbl/video/CUDAInteropNativeState.hpp | 45 -------- 16 files changed, 169 insertions(+), 201 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 2ec4b723c0..0e08fb2b97 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -39,21 +39,25 @@ #include "nbl/video/CUDAInterop.h" #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" - #include "cuda.h" #include "nvrtc.h" 
-#include -#include -#if CUDA_VERSION < 13000 - #error "Need CUDA 13.0 SDK or higher." -#endif - namespace nbl::video::cuda_native { inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + The CUDA/NVRTC table classes below contain the calls used and tested by Nabla's interop implementation. + After including this SDK opt-in header, consumer code can load extra Driver API or NVRTC symbols from the + same loaded libraries without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + The requested symbol must be declared by the CUDA SDK headers visible to this translation unit because the + helper uses decltype(cuNewCall) to preserve the native function signature. +*/ using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader @@ -184,73 +188,33 @@ struct SPTXResult nvrtcResult result; }; -template -concept cuda_opaque_handle = - std::is_trivially_copyable_v && - std::is_trivially_copyable_v && - sizeof(Opaque)==sizeof(Native) && - alignof(Opaque)==alignof(Native); -/* - Map Nabla opaque handles to CUDA SDK handle types. - - This is deliberately small. It is not an attempt to wrap CUDA. It only gives SDK opt-in code a convenient - way to pass Nabla-owned opaque handles to CUDA C APIs while checking that the public opaque type has the same - layout as the CUDA type visible in this translation unit. If a future SDK changes one of these handle layouts, - the SDK opt-in build fails here instead of letting ABI drift propagate through packaged Nabla headers. 
-*/ -template -struct SOpaqueCUDAType; - -template<> struct SOpaqueCUDAType { using type = CUdevice; }; -template<> struct SOpaqueCUDAType { using type = CUcontext; }; -template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; -template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; -template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; /* CUDA SDK view of an SDK-free opaque handle. The conversions are intentionally available only after including this header. Public Nabla headers expose - only the opaque SCU* values. Once a consumer opts in, SNativeHandle restores the CUDA spelling and ergonomics - for raw Driver API calls without adding accessors to every interop operation. + only the opaque SCU* values. Once a consumer opts in, the aliases below restore the CUDA spelling and + ergonomics for raw Driver API calls without adding accessors to every interop operation. Each alias maps one + Nabla opaque handle to the matching CUDA SDK handle and validates size/alignment against the SDK selected by + this opt-in translation unit. 
*/ -template -struct SNativeHandle -{ - using cuda_t = typename SOpaqueCUDAType::type; - static_assert(cuda_opaque_handle); - - SNativeHandle() = default; - SNativeHandle(const SNativeHandle&) = default; - SNativeHandle(const cuda_t& native) { operator=(native); } - SNativeHandle(const Opaque& opaque) { operator=(opaque); } - - SNativeHandle& operator=(const SNativeHandle&) = default; - SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } - SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } +using SCUdevice = cuda_interop::SNativeHandle; +using SCUcontext = cuda_interop::SNativeHandle; +using SCUdeviceptr = cuda_interop::SNativeHandle; +using SCUexternalMemory = cuda_interop::SNativeHandle; +using SCUexternalSemaphore = cuda_interop::SNativeHandle; - operator cuda_t&() { return value; } - operator const cuda_t&() const { return value; } - operator Opaque&() { return reinterpret_cast(value); } - operator const Opaque&() const { return reinterpret_cast(value); } - - Opaque* opaque() { return &static_cast(*this); } - const Opaque* opaque() const { return &static_cast(*this); } - Opaque asOpaque() const { return static_cast(*this); } - - cuda_t value = {}; -}; - -using SCUdevice = SNativeHandle; -using SCUcontext = SNativeHandle; -using SCUdeviceptr = SNativeHandle; -using SCUexternalMemory = SNativeHandle; -using SCUexternalSemaphore = SNativeHandle; - -inline bool isBuildCUDAVersionCompatible() +/* + Check whether this opt-in translation unit uses the exact CUDA SDK version that was used to build Nabla's + CUDA interop implementation. Opaque handle layout is checked by SNativeHandle aliases above. This exact + version check is a policy helper for SDK-typed code that wants to warn about or reject compatible-but-different + SDK headers. 
+*/ +inline bool isBuildCUDASDKVersionExactMatch() { - const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); return buildVersion==0u || buildVersion==CUDA_VERSION; } + /* Nabla interop API declarations with CUDA SDK signatures. @@ -263,12 +227,6 @@ inline bool isBuildCUDAVersionCompatible() NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ - do { \ - const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult(*(handler), nblCudaInteropResult)) \ - assert(false); \ - } while(0) NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/system/DynamicFunctionCaller.h b/include/nbl/system/DynamicFunctionCaller.h index cf99be32f0..d5642d3ea9 100644 --- a/include/nbl/system/DynamicFunctionCaller.h +++ b/include/nbl/system/DynamicFunctionCaller.h @@ -16,7 +16,7 @@ class DynamicFunctionCallerBase : public core::Unmovable { protected: static_assert(std::is_base_of::value, "Need a function pointer loader derived from `FuncPtrLoader`"); - FuncPtrLoaderT loader; + mutable FuncPtrLoaderT loader; public: //DynamicFunctionCallerBase() : loader() {} DynamicFunctionCallerBase(DynamicFunctionCallerBase&& other) : DynamicFunctionCallerBase() @@ -29,6 +29,16 @@ class DynamicFunctionCallerBase : public core::Unmovable 
} virtual ~DynamicFunctionCallerBase() = default; + inline bool isLibraryLoaded() const + { + return loader.isLibraryLoaded(); + } + + inline void* loadFuncPtr(const char* funcname) const + { + return loader.loadFuncPtr(funcname); + } + DynamicFunctionCallerBase& operator=(DynamicFunctionCallerBase&& other) { std::swap(loader, other.loader); @@ -41,6 +51,8 @@ class DynamicFunctionCallerBase : public core::Unmovable #define NBL_SYSTEM_IMPL_INIT_DYNLIB_FUNCPTR(FUNC_NAME) ,NBL_CONCATENATE(p , FUNC_NAME)(Base::loader.loadFuncPtr( #FUNC_NAME )) #define NBL_SYSTEM_IMPL_SWAP_DYNLIB_FUNCPTR(FUNC_NAME) std::swap(NBL_CONCATENATE(p, FUNC_NAME),other.NBL_CONCATENATE(p, FUNC_NAME)); +// Load an extra function from an already loaded dynamic library without adding it to the generated caller class. +#define NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(CALLER, FUNC_NAME) nbl::system::DynamicLibraryFunctionPointer((CALLER).loadFuncPtr(#FUNC_NAME)) #define NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS( CLASS_NAME, FUNC_PTR_LOADER_TYPE, ... 
) \ class CLASS_NAME : public nbl::system::DynamicFunctionCallerBase\ diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 5acfd35831..4658e51a10 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -18,11 +18,6 @@ namespace nbl::video { class CCUDAHandler; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: @@ -84,7 +79,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: friend class CCUDAHandler; - friend struct cuda_native::SAccess; struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 6243bd8c73..510f483b3b 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -14,11 +14,6 @@ namespace nbl::video { class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: @@ -37,7 +32,7 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: - friend struct cuda_native::SAccess; + friend class CCUDADevice; struct SNativeState; CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index db30b08587..a77ab66b68 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -25,7 +25,6 @@ namespace cuda_native { class CUDA; class NVRTC; -struct SAccess; } namespace cuda_interop @@ -52,9 +51,10 @@ class NBL_API2 
CCUDAHandler : public core::IReferenceCounted { public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - static uint32_t getBuildCUDAVersion(); + static uint32_t getBuildCUDASDKVersion(); const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; + inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } inline core::SRange getSTDHeaders() { @@ -80,8 +80,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ~CCUDAHandler() override; private: - friend struct cuda_native::SAccess; - struct SNativeState; CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 454088b7ae..e2c9bb6db6 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -12,21 +12,16 @@ namespace nbl::video class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; + bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } private: friend class CCUDADevice; - friend struct cuda_native::SAccess; struct SNativeState; CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 5a4f28abde..7f2b266383 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -15,11 +15,6 @@ namespace nbl::video class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class 
NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: @@ -28,7 +23,6 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted private: friend class CCUDADevice; - friend struct cuda_native::SAccess; struct SNativeState; CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 741a04c319..92888d3ccf 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -6,6 +6,7 @@ #include #include +#include namespace nbl::video::cuda_interop { @@ -30,6 +31,47 @@ struct SCUdeviceptr : SOpaqueCUDAHandle {}; struct SCUexternalMemory : SOpaqueCUDAHandle {}; struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +/* + Native view of an SDK-free opaque handle. + + This template does not depend on CUDA SDK types by itself. CUDAInteropNative.h binds it to concrete CU* types + after the consumer opts into CUDA SDK headers. The layout check keeps the public opaque handle and the native + SDK handle compatible in that translation unit while preserving Nabla's SDK-free public headers. 
+*/ +template +struct SNativeHandle +{ + using cuda_t = Native; + static_assert(cuda_opaque_handle); + + SNativeHandle() = default; + SNativeHandle(const SNativeHandle&) = default; + SNativeHandle(const cuda_t& native) { operator=(native); } + SNativeHandle(const Opaque& opaque) { operator=(opaque); } + + SNativeHandle& operator=(const SNativeHandle&) = default; + SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } + SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + + operator cuda_t&() { return value; } + operator const cuda_t&() const { return value; } + operator Opaque&() { return reinterpret_cast(value); } + operator const Opaque&() const { return reinterpret_cast(value); } + + Opaque* opaque() { return &static_cast(*this); } + const Opaque* opaque() const { return &static_cast(*this); } + Opaque asOpaque() const { return static_cast(*this); } + + cuda_t value = {}; +}; + } #endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a55bafbb9f..231658e949 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,7 +6,7 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA SDK types with `cuda_native::SNativeHandle`. +- `CUDAInteropNative.h` is the opt-in SDK boundary. 
It includes CUDA SDK headers and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. ## CMake Usage @@ -55,8 +55,11 @@ This affects SDK opt-in compilation and generated runtime header discovery only. auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) - return false; +const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); +if (!exactBuildSDK) +{ + // Warn here, or return false if this application requires exact same-SDK policy. +} auto memory = cudaDevice->createExportableMemory({ .size = size, @@ -66,7 +69,7 @@ auto memory = cudaDevice->createExportableMemory({ nbl::video::cuda_native::SCUdeviceptr mapped; if (importedMemory) - importedMemory->getMappedBuffer(mapped.opaque()); + importedMemory->getMappedBuffer(mapped); CUdeviceptr rawMapped = mapped; CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); @@ -86,16 +89,25 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: - `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. -- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. +- The default tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. 
The symbol name must be declared by the CUDA SDK headers visible to that translation unit: + +```cpp +auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); +if (pcuNewCall) + pcuNewCall(...); +``` + +- `cuda_interop::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. The template itself is SDK-free. `CUDAInteropNative.h` only provides CUDA-typed aliases. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. +- `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. - `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. -- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. 
## ABI @@ -104,7 +116,7 @@ Smoke examples: - Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. -- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index f760b78a1c..79e85555b7 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -37,7 +37,7 @@ using namespace nbl::video; cuda_native::SCUdeviceptr mappedVulkanMemory; if (importedFromVulkan) - importedFromVulkan->getMappedBuffer(mappedVulkanMemory.opaque()); + importedFromVulkan->getMappedBuffer(mappedVulkanMemory); const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); CUexternalSemaphore cudaSemaphore = nullptr; @@ -127,8 +127,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew return false; static_assert(std::is_same_v); - if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) - return false; + [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -148,6 +147,11 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!handler) return true; + auto pcuDriverGetVersion = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuDriverGetVersion); + int loadedDriverVersion = 0; + if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) + return false; + if (!cudaFp16HeaderCompileProbe(*handler)) return false; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 802e224793..1c73068a6d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -55,6 +55,15 @@ constexpr const char* VirtualArchCompileOption[] = { static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); +static CUmemAllocationHandleType 
getAllocationHandleType() +{ +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + } CCUDADevice::CCUDADevice( @@ -95,7 +104,7 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), + .requestedHandleTypes = getAllocationHandleType(), .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, @@ -108,12 +117,12 @@ CCUDADevice::CCUDADevice( cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_native::SCUdevice(cuda_native::SAccess::native(*this).handle).asOpaque(); + return cuda_native::SCUdevice(m_native->handle); } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); + return cuda_native::SCUcontext(m_native->context); } static bool isDeviceLocal(CUmemLocationType location) @@ -121,11 +130,9 @@ static bool isDeviceLocal(CUmemLocationType location) return location==CU_MEM_LOCATION_TYPE_DEVICE; } -static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice nativeDevice, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { - const auto handler = device.getHandler(); - const auto& native = cuda_native::SAccess::native(device); - const auto& cu = handler->getCUDAFunctionTable(); + const auto& cu = handler.getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -133,19 +140,19 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& 
device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = native.handle }, + .location = { .type = location, .id = nativeDevice }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemUnmap(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -157,8 +164,6 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { const auto handler = getHandler(); - auto& native = cuda_native::SAccess::native(*this); - auto logger = cuda_native::SAccess::logger(*this); const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { @@ -180,37 +185,37 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), - .location = { .type = location, .id = native.handle }, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = location, .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - auto nativeState = cuda_native::SAccess::makeExportableMemoryNativeState(); + auto nativeState = std::make_unique(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, 
&prop, 0); CUDA_SUCCESS != err) { - logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*this,&cuda_native::SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) { - logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); if (!CloseExternalHandle(params.externalHandle)) - logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } @@ -218,14 +223,14 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { cuda_native::defaultHandleResult(*handler, err); - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + 
cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); if (!CloseExternalHandle(params.externalHandle)) - logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } - return cuda_native::SAccess::makeExportableMemory(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); + return CCUDAExportableMemory::create(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 2696fe1ebd..929453b3bd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -59,13 +59,13 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); if (!CloseExternalHandle(m_params.externalHandle)) - cuda_native::SAccess::logger(*m_device).log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); } cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_native::SCUdeviceptr(m_native->ptr).asOpaque(); + return cuda_native::SCUdeviceptr(m_native->ptr); } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9e40942914..ce9a8aa46b 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -340,19 +340,19 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; -uint32_t CCUDAHandler::getBuildCUDAVersion() +uint32_t CCUDAHandler::getBuildCUDASDKVersion() { return CUDA_VERSION; } const cuda_native::CUDA& 
CCUDAHandler::getCUDAFunctionTable() const { - return cuda_native::SAccess::native(*this).cuda; + return m_native->cuda; } const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const { - return cuda_native::SAccess::native(*this).nvrtc; + return m_native->nvrtc; } namespace cuda_native @@ -726,21 +726,23 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return defaultHandleResult(result,SAccess::logger(handler)); + return defaultHandleResult(result,handler.getLogger()); } bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { + const auto& nvrtc = handler.getNVRTCFunctionTable(); + const auto logger = handler.getLogger(); switch (result) { case NVRTC_SUCCESS: return true; break; default: - if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString) - SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result)); + if (nvrtc.pnvrtcGetErrorString) + logger.log("%s\n",system::ILogger::ELL_ERROR,nvrtc.pnvrtcGetErrorString(result)); else - SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } _NBL_DEBUG_BREAK_IF(true); @@ -886,31 +888,33 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string #else #error "Unsuported Platform" #endif - return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + return handler.getNVRTCFunctionTable().pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange 
options) { - return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); + return handler.getNVRTCFunctionTable().pnvrtcCompileProgram(prog,options.size(),options.begin()); } nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; - nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + const auto& nvrtc = handler.getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(prog, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(prog,log.data()); } SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; - nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); + const auto& nvrtc = handler.getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(prog,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -920,7 +924,7 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } static const core::vector& getDefaultRuntimeIncludeOptions() @@ -962,7 +966,7 @@ SPTXResult compileDirectlyToPTX( auto cleanup = core::makeRAIIExiter([&]() -> void { if (program) - SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); }); result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); @@ -1095,7 +1099,7 @@ CCUDAHandler::CCUDAHandler( 
CCUDAHandler::~CCUDAHandler() = default; -uint32_t CCUDAHandler::getBuildCUDAVersion() +uint32_t CCUDAHandler::getBuildCUDASDKVersion() { return 0u; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index cff48931c0..8ccad3e119 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,7 +20,7 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_native::SCUexternalMemory(m_native->handle).asOpaque(); + return cuda_native::SCUexternalMemory(m_native->handle); } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const @@ -38,7 +38,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) return false; - *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer).asOpaque(); + *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer); return true; } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 6a43fefc4c..d495f979ab 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,7 +19,7 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle).asOpaque(); + return cuda_native::SCUexternalSemaphore(m_native->handle); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 743bd10c3e..3a1500e77e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -60,51 +60,6 @@ struct CCUDAImportedSemaphore::SNativeState {} }; -namespace cuda_native -{ - -struct SAccess -{ - static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } - static const 
CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; } - - static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; } - static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; } - - static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; } - static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; } - static std::unique_ptr makeExportableMemoryNativeState() - { - return std::unique_ptr(new CCUDAExportableMemory::SNativeState()); - } - static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } - static core::smart_refctd_ptr makeExportableMemory(core::smart_refctd_ptr device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr&& nativeState) - { - return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState)); - } - - static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } - static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } - - static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } - static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } - - static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); } - static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } - static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } - static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } - static CUmemAllocationHandleType 
allocationHandleType() - { - #ifdef _WIN32 - return CU_MEM_HANDLE_TYPE_WIN32; - #else - return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - #endif - } -}; - -} - } #endif From ed8a1d6eb3bfbdec6641410a033ab079169f3265 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 12:05:03 +0200 Subject: [PATCH 32/51] Refine CUDA interop boundary --- CMakeLists.txt | 2 +- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 172 +----- include/nbl/video/CCUDADevice.h | 2 + include/nbl/video/CCUDAHandler.h | 12 + include/nbl/video/CUDAInteropHandles.h | 4 - include/nbl/video/CUDAInteropNativeAPI.h | 191 +++++++ src/nbl/ext/CUDAInterop/README.md | 14 +- src/nbl/video/CCUDADevice.cpp | 77 +-- src/nbl/video/CCUDAExportableMemory.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 489 ++++++++++++++---- src/nbl/video/CCUDAImportedMemory.cpp | 34 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- src/nbl/video/CUDAInteropNativeState.hpp | 24 +- 14 files changed, 678 insertions(+), 349 deletions(-) create mode 100644 include/nbl/video/CUDAInteropNativeAPI.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9251a3ee68..97ece5d9f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,7 +70,7 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build CUDA interop support?" 
OFF) set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) diff --git a/examples_tests b/examples_tests index e289ee14f5..d373d313d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e289ee14f5b8f05004726e6f03c81a9a2e768219 +Subproject commit d373d313d3e70579d650c7804af8a2785cfede9a diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 0e08fb2b97..538645ce3d 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -35,159 +35,10 @@ */ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ -#include -#include "nbl/video/CUDAInterop.h" -#include "nbl/asset/ICPUBuffer.h" -#include "nbl/system/DynamicFunctionCaller.h" -#include "cuda.h" -#include "nvrtc.h" +#include "nbl/video/CUDAInteropNativeAPI.h" namespace nbl::video::cuda_native { -inline constexpr int MinimumCUDADriverVersion = 13000; -inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; -static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); - -/* - The CUDA/NVRTC table classes below contain the calls used and tested by Nabla's interop implementation. - After including this SDK opt-in header, consumer code can load extra Driver API or NVRTC symbols from the - same loaded libraries without changing Nabla's ABI: - - auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); - - The requested symbol must be declared by the CUDA SDK headers visible to this translation unit because the - helper uses decltype(cuNewCall) to preserve the native function signature. 
-*/ -using LibLoader = system::DefaultFuncPtrLoader; - -NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - ,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - 
,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback -); - -NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize -); - -struct SCUDADeviceInfo -{ - CUdevice handle = {}; - CUuuid uuid = {}; -}; - -struct SPTXResult -{ - core::smart_refctd_ptr ptx; - nvrtcResult result; -}; - /* CUDA SDK view of an SDK-free opaque handle. @@ -215,27 +66,6 @@ inline bool isBuildCUDASDKVersionExactMatch() return buildVersion==0u || buildVersion==CUDA_VERSION; } -/* - Nabla interop API declarations with CUDA SDK signatures. - - These declarations belong to the Nabla interop API. They live behind Nabla::ext::CUDAInterop because their - signatures mention CUDA/NVRTC SDK types directly. Keeping them out of nbl/video/CCUDA*.h means Nabla's public - API can be parsed and packaged without CUDA SDK headers. Nabla still owns the exported glue definitions. - Consumers accept this SDK-typed API surface only by including this header and linking the explicit interop - target. 
-*/ -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); -NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr -); - } #endif diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 4658e51a10..56e81d4b2f 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -82,12 +82,14 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + bool isValid() const; const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + bool m_valid = false; core::smart_refctd_ptr m_handler; std::unique_ptr m_native; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index a77ab66b68..e69792b217 100644 --- 
a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -31,9 +31,18 @@ namespace cuda_interop { inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +struct SRuntimeIncludeDir +{ + system::path path; + std::string source; + uint32_t cudaVersion = 0u; + bool completeRuntimeHeaderSet = false; +}; + struct SRuntimeCompileEnvironment { core::vector includeDirs; + core::vector includeDirInfos; }; NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); @@ -52,8 +61,11 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDASDKVersion(); + uint32_t getLoadedCUDADriverVersion() const; + std::array getLoadedNVRTCVersion() const; const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; + core::SRange getDefaultRuntimeIncludeOptions() const; inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } inline core::SRange getSTDHeaders() diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 92888d3ccf..987a130ad1 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -65,10 +65,6 @@ struct SNativeHandle operator Opaque&() { return reinterpret_cast(value); } operator const Opaque&() const { return reinterpret_cast(value); } - Opaque* opaque() { return &static_cast(*this); } - const Opaque* opaque() const { return &static_cast(*this); } - Opaque asOpaque() const { return static_cast(*this); } - cuda_t value = {}; }; diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h new file mode 100644 index 0000000000..52dad41f09 --- /dev/null +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -0,0 +1,191 @@ +// Copyright (C) 2018-2020 - DevSH 
Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ + +#include + +#include "nbl/video/CUDAInterop.h" +#include "nbl/asset/ICPUBuffer.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" + +namespace nbl::video::cuda_native +{ + +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. + + This file lives under include/ because it is shared with nbl/ext/CUDAInterop/CUDAInteropNative.h, the public + opt-in header for consumers that explicitly accept CUDA SDK types. Its physical location does not make it part + of the default Nabla public interface: nbl/video/CCUDA*.h headers, Nabla::Nabla public requirements, and PCH + do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. + + The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between + raw CUDA code and Nabla's exported CUDA interop objects: dynamic function tables, NVRTC helpers, error handling, + and runtime header discovery integration. Consumers enter this surface only by linking Nabla::ext::CUDAInterop + and including nbl/ext/CUDAInterop/CUDAInteropNative.h. +*/ +using LibLoader = system::DefaultFuncPtrLoader; + +/* + The CUDA/NVRTC table classes contain the calls used and tested by Nabla's interop implementation. 
SDK opt-in + consumers can load additional Driver API or NVRTC symbols from the same table without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + + The requested symbol must be declared by the CUDA SDK visible to that translation unit because the helper uses + decltype(cuNewCall) to preserve the native function signature. +*/ +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload 
+ ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +struct SPTXResult +{ + core::smart_refctd_ptr ptx; + nvrtcResult result; +}; + +/* + Exported Nabla glue declarations with CUDA SDK signatures. + + These are not a CUDA wrapper. They are the small boundary surface used for error handling, NVRTC helpers, + runtime header discovery integration, and dynamic CUDA/NVRTC table access. Nabla owns the definitions. + The signatures mention CUDA SDK types, so they are intentionally unavailable to consumers that only parse + SDK-free nbl/video/CCUDA*.h headers. 
+*/ +NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 SPTXResult compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr +); + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 231658e949..5677db046f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,7 +6,8 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the opt-in SDK boundary. 
It includes CUDA SDK headers and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and exported Nabla glue functions whose signatures use CUDA SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. ## CMake Usage @@ -89,7 +90,7 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: - `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. -- The default tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: +- The shipped tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: ```cpp auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); @@ -115,7 +116,7 @@ Smoke examples: - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation state stay private to Nabla. - A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. @@ -129,8 +130,11 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- Runtime lookup records the source of every accepted include root and parses `CUDA_VERSION` from `cuda.h` when available. The startup report prints the primary include root, its source, its parsed CUDA version, and the full search order. +- The first include root is not required to match the SDK used to build Nabla. It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. +- If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. 
- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Discovery is cached after the first call. +- `CCUDAHandler` captures discovered include directories when it is created. `cuda_native::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. @@ -157,7 +161,7 @@ CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On ## CUDA ON/OFF Builds - SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. -- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. +- Nabla implementation `.cpp` files include CUDA SDK headers only behind `_NBL_COMPILE_WITH_CUDA_`. - CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. - The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. 
diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 1c73068a6d..0178f31fc7 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -67,15 +67,15 @@ static CUmemAllocationHandleType getAllocationHandleType() } CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& vulkanConnection, - IPhysicalDevice* const vulkanDevice, + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, - core::smart_refctd_ptr&& handler) : + core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), - m_defaultCompileOptions(), - m_vulkanConnection(std::move(vulkanConnection)), - m_virtualArchitecture(virtualArchitecture), + m_defaultCompileOptions(), + m_vulkanConnection(std::move(vulkanConnection)), + m_virtualArchitecture(virtualArchitecture), m_handler(std::move(handler)), m_native(std::move(nativeState)) { @@ -86,43 +86,43 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) - assert(false); + return; if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) - assert(false); + return; for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - - #ifdef _WIN32 - OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) - }; - #endif - - const auto prop = CUmemAllocationProp{ - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = getAllocationHandleType(), - .location = { .type = static_cast(locationType), .id = m_native->handle }, - #ifdef _WIN32 - .win32HandleMetaData = &metadata, - #endif - }; +#ifdef _WIN32 
+ OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) - assert(false); + return; } + m_valid = true; } cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_native::SCUdevice(m_native->handle); + return cuda_interop::SNativeHandle(m_native->handle); } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_native::SCUcontext(m_native->context); + return cuda_interop::SNativeHandle(m_native->context); } static bool isDeviceLocal(CUmemLocationType location) @@ -176,14 +176,14 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; auto& cu = handler->getCUDAFunctionTable(); - + #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) + .Length = sizeof(OBJECT_ATTRIBUTES) }; #endif - const auto prop = CUmemAllocationProp{ + const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = getAllocationHandleType(), .location = { .type = location, .id = m_native->handle }, @@ -275,10 +275,9 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - // TODO(kevinyu): Fix this later. Make it compile first. 
.handle = {.win32 = {.handle = sema->getExternalHandle() }}, #else - .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, .handle = {.fd = sema->getExternalHandle()} #endif }; @@ -299,7 +298,13 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); + if (m_native->context) + cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); +} + +bool CCUDADevice::isValid() const +{ + return m_valid; } } @@ -321,6 +326,7 @@ CCUDADevice::CCUDADevice( : m_logger(nullptr) , m_vulkanConnection(std::move(vulkanConnection)) , m_virtualArchitecture(virtualArchitecture) + , m_valid(false) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) { @@ -329,6 +335,11 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; +bool CCUDADevice::isValid() const +{ + return false; +} + cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { return {}; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 929453b3bd..722c958b68 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -65,7 +65,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_native::SCUdeviceptr(m_native->ptr); + return cuda_interop::SNativeHandle(m_native->ptr); } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index ce9a8aa46b..6d8b2ffb70 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace nbl::video::cuda_interop @@ -62,28 +63,71 @@ bool looksLikeCUDAIncludeDir(const system::path& path) 
isRegularFile(path/"nv"/"target"); } -void appendIncludeDir(core::vector& includeDirs, system::path path) +uint32_t readCUDAVersion(const system::path& includeDir) +{ + std::ifstream input(includeDir/"cuda.h"); + if (!input) + return 0u; + + std::string line; + while (std::getline(input,line)) + { + std::istringstream stream(line); + std::string directive; + stream >> directive; + if (directive!="#define") + continue; + + std::string name; + stream >> name; + if (name!="CUDA_VERSION") + continue; + + uint32_t version = 0u; + if (stream >> version) + return version; + } + return 0u; +} + +bool looksLikeCompleteRuntimeHeaderSet(const system::path& includeDir) +{ + return isRegularFile(includeDir/"cuda.h") && + isRegularFile(includeDir/"cuda_runtime_api.h") && + isRegularFile(includeDir/"vector_types.h"); +} + +void appendIncludeDir(SRuntimeCompileEnvironment& environment, system::path path, std::string source) { if (path.empty() || !looksLikeCUDAIncludeDir(path)) return; path = normalizedAbsolute(std::move(path)); const auto pathString = path.generic_string(); - const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) { + const auto alreadyAdded = std::find_if(environment.includeDirs.begin(),environment.includeDirs.end(),[&](const system::path& existing) { return existing.generic_string()==pathString; }); - if (alreadyAdded==includeDirs.end()) - includeDirs.push_back(std::move(path)); + if (alreadyAdded==environment.includeDirs.end()) + { + SRuntimeIncludeDir info; + info.path = path; + info.source = std::move(source); + info.cudaVersion = readCUDAVersion(path); + info.completeRuntimeHeaderSet = looksLikeCompleteRuntimeHeaderSet(path); + + environment.includeDirs.push_back(std::move(path)); + environment.includeDirInfos.push_back(std::move(info)); + } } -void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const system::path& root, uint32_t maxDepth) +void 
appendCUDAIncludeDirsBelow(SRuntimeCompileEnvironment& environment, const system::path& root, uint32_t maxDepth, std::string source) { if (!isDirectory(root)) return; if (looksLikeCUDAIncludeDir(root)) { - appendIncludeDir(includeDirs,root); + appendIncludeDir(environment,root,std::move(source)); return; } if (maxDepth==0u) @@ -106,19 +150,19 @@ void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const s return lhs.generic_string()>rhs.generic_string(); }); for (const auto& candidate : candidates) - appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u); + appendCUDAIncludeDirsBelow(environment,candidate,maxDepth-1u,source); } -void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) +void appendCUDAIncludeRoot(SRuntimeCompileEnvironment& environment, const system::path& root, std::string source) { if (root.empty()) return; - appendIncludeDir(includeDirs,root); - appendIncludeDir(includeDirs,root/"include"); + appendIncludeDir(environment,root,source); + appendIncludeDir(environment,root/"include",std::move(source)); } -void appendRuntimePathsConfig(core::vector& includeDirs, const system::path& configFile) +void appendRuntimePathsConfig(SRuntimeCompileEnvironment& environment, const system::path& configFile, const char* source) { if (!isRegularFile(configFile)) return; @@ -137,7 +181,7 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys for (const auto& path : *paths) if (path.is_string()) - appendIncludeDir(includeDirs,system::path(path.get())); + appendIncludeDir(environment,system::path(path.get()),std::string(source)+": "+configFile.generic_string()); } template @@ -160,60 +204,66 @@ void appendPathListEnv(const char* name, Append append) } } -void appendRuntimePathsConfigs(core::vector& includeDirs, const core::vector& explicitRuntimePathFiles) +void appendRuntimePathsConfigs(SRuntimeCompileEnvironment& environment, const core::vector& explicitRuntimePathFiles) { for (const auto& runtimePathFile : 
explicitRuntimePathFiles) - appendRuntimePathsConfig(includeDirs,runtimePathFile); + appendRuntimePathsConfig(environment,runtimePathFile,"explicit runtime JSON"); - const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); }; - appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig); - appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + }); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + }); const auto exeDir = system::executableDirectory(); if (!exeDir.empty()) - appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); + appendRuntimePathsConfig(environment,exeDir/RuntimePathsFileName,"executable-local runtime JSON"); } -void appendAppLocalIncludeDirs(core::vector& includeDirs) +void appendAppLocalIncludeDirs(SRuntimeCompileEnvironment& environment) { const auto exeDir = system::executableDirectory(); if (exeDir.empty()) return; - appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); - appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u); - appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); - appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); - appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u); + appendIncludeDir(environment,exeDir/"cuda"/"include","app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir/"nvidia",4u,"app-local nvidia package"); + appendIncludeDir(environment,exeDir/"Libraries"/"cuda"/"include","app-local Libraries/cuda/include"); + appendIncludeDir(environment,exeDir.parent_path()/"cuda"/"include","parent app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir.parent_path()/"nvidia",4u,"parent 
app-local nvidia package"); } -void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) +void appendPythonPackageIncludeDirs(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) { if (root.empty()) return; - appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u); - appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u); - appendIncludeDir(includeDirs,root/"Library"/"include"); - appendIncludeDir(includeDirs,root/"include"); + appendCUDAIncludeDirsBelow(environment,root/"Lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendCUDAIncludeDirsBelow(environment,root/"lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendIncludeDir(environment,root/"Library"/"include",std::string(source)+" Library/include"); + appendIncludeDir(environment,root/"include",std::string(source)+" include"); } -void appendEnvironmentIncludeDirs(core::vector& includeDirs) +void appendEnvironmentIncludeDirs(SRuntimeCompileEnvironment& environment) { - const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); }; - appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); - appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + }); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT")); + 
appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_PATH"),"CUDA_PATH"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_HOME"),"CUDA_HOME"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_ROOT"),"CUDA_ROOT"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDAToolkit_ROOT"),"CUDAToolkit_ROOT"); - appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV")); - appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX")); + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("VIRTUAL_ENV"),"VIRTUAL_ENV"); + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("CONDA_PREFIX"),"CONDA_PREFIX"); } -void appendCUDAInstallRoots(core::vector& includeDirs, const system::path& root) +void appendCUDAInstallRoots(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) { if (!isDirectory(root)) return; @@ -233,17 +283,17 @@ void appendCUDAInstallRoots(core::vector& includeDirs, const syste return lhs.generic_string()>rhs.generic_string(); }); for (const auto& candidate : candidates) - appendIncludeDir(includeDirs,candidate); + appendIncludeDir(environment,candidate,source); } -void appendSystemIncludeDirs(core::vector& includeDirs) +void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) { #if defined(_NBL_PLATFORM_WINDOWS_) - appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"); + appendCUDAInstallRoots(environment,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA","system CUDA Toolkit install root"); #else - appendIncludeDir(includeDirs,"/usr/local/cuda/include"); - appendCUDAInstallRoots(includeDirs,"/usr/local"); - appendIncludeDir(includeDirs,"/usr/include"); + appendIncludeDir(environment,"/usr/local/cuda/include","system /usr/local/cuda"); + appendCUDAInstallRoots(environment,"/usr/local","system /usr/local CUDA install root"); + 
appendIncludeDir(environment,"/usr/include","system /usr/include"); #endif } @@ -252,13 +302,25 @@ void appendSystemIncludeDirs(core::vector& includeDirs) SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) { SRuntimeCompileEnvironment environment; + + /* + Runtime header discovery builds the ordered include list passed to NVRTC. It is not a lock to the CUDA SDK + used to build Nabla. A packaged Nabla must stay relocatable, so host-specific include paths are accepted + only when the application provides them at runtime: direct arguments, JSON next to the executable, an + override JSON, app-local header bundles, environment variables, or finally common toolkit install roots. + + The first root containing a requested header wins exactly like normal C/C++ include search. Keep every + accepted root with its source and parsed CUDA_VERSION so startup logs can explain what NVRTC will see. + This is also why mismatched or partial roots produce diagnostics instead of changing discovery order or + hard-failing before the user kernel is compiled. 
+ */ for (auto& includeDir : explicitIncludeDirs) - appendIncludeDir(environment.includeDirs,std::move(includeDir)); + appendIncludeDir(environment,std::move(includeDir),"explicit include dir"); - appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles); - appendAppLocalIncludeDirs(environment.includeDirs); - appendEnvironmentIncludeDirs(environment.includeDirs); - appendSystemIncludeDirs(environment.includeDirs); + appendRuntimePathsConfigs(environment,runtimePathFiles); + appendAppLocalIncludeDirs(environment); + appendEnvironmentIncludeDirs(environment); + appendSystemIncludeDirs(environment); return environment; } @@ -292,6 +354,82 @@ int cudaVersionMinor(int version) return (version%1000)/10; } +int cudaVersionCode(int major, int minor) +{ + return major*1000+minor*10; +} + +system::path loadedRuntimeModulePath(const char* moduleName) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto moduleDir = system::loadedModuleDirectory(moduleName); + if (moduleDir.empty()) + return {}; + return moduleDir/(std::string(moduleName)+".dll"); + #else + return {}; + #endif +} + +std::string cudaVersionString(int version) +{ + std::ostringstream stream; + stream << cudaVersionMajor(version) << "." << cudaVersionMinor(version); + return stream.str(); +} + +std::string cudaVersionString(const std::array& version) +{ + std::ostringstream stream; + stream << version[0] << "." 
<< version[1]; + return stream.str(); +} + +std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& includeDir) +{ + std::ostringstream stream; + stream << includeDir.path.generic_string() << " (" << includeDir.source; + if (includeDir.cudaVersion!=0u) + stream << ", CUDA_VERSION " << includeDir.cudaVersion << " / " << cudaVersionString(includeDir.cudaVersion); + else + stream << ", CUDA_VERSION unknown"; + if (!includeDir.completeRuntimeHeaderSet) + stream << ", partial header root"; + stream << ")"; + return stream.str(); +} + +std::string cudaRuntimeReport( + const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, + const std::array& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) +{ + std::ostringstream stream; + stream << "CCUDAHandler: CUDA interop runtime report:\n"; + stream << " - Nabla build CUDA SDK: " << cudaVersionString(buildVersion) << "\n"; + stream << " - CUDA Driver API: " << cudaVersionString(cudaDriverVersion); + if (!cudaDriverPath.empty()) + stream << " (" << cudaDriverPath.generic_string() << ")"; + stream << "\n"; + stream << " - NVRTC runtime: " << cudaVersionString(nvrtcVersion) << " (" << nvrtcLibraryName; + if (!nvrtcPath.empty()) + stream << ", " << nvrtcPath.generic_string(); + stream << ")\n"; + + if (runtimeEnvironment.includeDirs.empty()) + { + stream << " - NVRTC runtime header search path: none discovered"; + } + else + { + stream << " - Primary NVRTC runtime header path: " << runtimeIncludeDirDescription(runtimeEnvironment.includeDirInfos.front()) << "\n"; + stream << " - NVRTC runtime header search order (first path containing the requested header wins):\n"; + for (const auto& includeDir : runtimeEnvironment.includeDirInfos) + stream << " - " << runtimeIncludeDirDescription(includeDir) << "\n"; + } + return stream.str(); +} + } CCUDAHandler::CCUDAHandler( @@ -310,6 
+448,8 @@ CCUDAHandler::CCUDAHandler( m_headerNamesStorage.push_back(header->getFileName().string()); m_headerNames.push_back(m_headerNamesStorage.back().c_str()); } + for (const auto& option : m_native->runtimeIncludeOptions) + m_native->runtimeIncludeOptionPtrs.push_back(option.c_str()); int deviceCount = 0; if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) @@ -326,9 +466,8 @@ CCUDAHandler::CCUDAHandler( continue; auto& nativeDevice = m_native->deviceStates.emplace_back(); - nativeDevice.info.handle = handle; - nativeDevice.info.uuid = uuid; - m_native->availableDevices.push_back(nativeDevice.info); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; auto& cleanDevice = m_availableDevices.emplace_back(); memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); @@ -345,6 +484,16 @@ uint32_t CCUDAHandler::getBuildCUDASDKVersion() return CUDA_VERSION; } +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return m_native->cudaDriverVersion; +} + +std::array CCUDAHandler::getLoadedNVRTCVersion() const +{ + return m_native->nvrtcVersion; +} + const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const { return m_native->cuda; @@ -355,6 +504,14 @@ const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const return m_native->nvrtc; } +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + if (m_native->runtimeIncludeOptionPtrs.empty()) + return {nullptr,nullptr}; + const auto* begin = m_native->runtimeIncludeOptionPtrs.data(); + return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; +} + namespace cuda_native { @@ -480,6 +637,11 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) This indicates that a PTX JIT compilation failed. )===",system::ILogger::ELL_ERROR); break; + case CUDA_ERROR_UNSUPPORTED_PTX_VERSION: + logger.log(R"===(CCUDAHandler: + This indicates that the PTX version is unsupported by the CUDA driver. 
Check that the CUDA driver runtime can consume PTX produced by the loaded NVRTC runtime. + )===",system::ILogger::ELL_ERROR); + break; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: logger.log(R"===(CCUDAHandler: This indicates an error with OpenGL or DirectX context. @@ -717,15 +879,25 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) break; case CUDA_ERROR_UNKNOWN: default: - logger.log("CCUDAHandler: Unknown CUDA Error!\n",system::ILogger::ELL_ERROR); + logger.log("CCUDAHandler: Unknown CUDA error code %d.",system::ILogger::ELL_ERROR,static_cast(result)); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { + if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) + { + const auto cudaVersion = handler.getLoadedCUDADriverVersion(); + const auto nvrtcVersion = handler.getLoadedNVRTCVersion(); + handler.getLogger().log( + "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", + system::ILogger::ELL_ERROR, + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), + nvrtcVersion[0],nvrtcVersion[1] + ); + } return defaultHandleResult(result,handler.getLogger()); } @@ -745,7 +917,6 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } @@ -764,39 +935,6 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #error "Unsuported Platform" #endif ); - - cuda_native::NVRTC nvrtc = {}; - #if defined(_NBL_WINDOWS_API_) - // Perpetual TODO: any new CUDA releases we need to account for? 
- // Version List: https://developer.nvidia.com/cuda-toolkit-archive - const char* nvrtc64_versions[] = { - "nvrtc64_132", - "nvrtc64_131", - "nvrtc64_130", - nullptr - }; - - const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; - for (auto verpath=nvrtc64_versions; *verpath; verpath++) - { - for (auto suffix=nvrtc64_suffices; *suffix; suffix++) - { - std::string path(*verpath); - path += *suffix; - nvrtc = cuda_native::NVRTC(path.c_str()); - if (nvrtc.pnvrtcVersion) - break; - } - if (nvrtc.pnvrtcVersion) - break; - } - #elif defined(_NBL_POSIX_API_) - nvrtc = cuda_native::NVRTC("nvrtc"); - //nvrtc_builtins = NVRTC("nvrtc-builtins"); - #else - #error "Unsuported Platform" - #endif - // need a complex safe calling chain because DLL/SO might not have loaded #define SAFE_CUDA_CALL(FUNC,...) \ @@ -832,6 +970,86 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // stop the pollution #undef SAFE_CUDA_CALL + auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, std::array& version, const char* name) -> bool + { + if (!candidate.pnvrtcVersion) + return false; + + const auto result = candidate.pnvrtcVersion(version.data(),version.data()+1); + if (result==NVRTC_SUCCESS) + return true; + + logger.log("CCUDAHandler: nvrtcVersion failed for %s with NVRTC error code %d.",system::ILogger::ELL_WARNING,name,static_cast(result)); + version = {-1,-1}; + return false; + }; + + cuda_native::NVRTC nvrtc = {}; + std::array nvrtcVersion = {-1,-1}; + std::string nvrtcLibraryName; + + #if defined(_NBL_WINDOWS_API_) + cuda_native::NVRTC fallbackNVRTC = {}; + std::array fallbackNVRTCVersion = {-1,-1}; + std::string fallbackNVRTCLibraryName; + + /* + The CUDA driver consumes the final PTX, not the toolkit that provided headers or nvrtc*.dll. + A real machine can have an older NVIDIA driver and a newer CUDA Toolkit side by side, for example + driver API 13.1 from nvcuda.dll with CUDA 13.2 Toolkit/NVRTC in CUDA_PATH. 
In that setup NVRTC can + emit PTX the installed driver rejects with CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Prefer an NVRTC runtime + that is not newer than the loaded driver and log the full version matrix when no compatible one exists. + */ + const char* nvrtc64_versions[] = { + "nvrtc64_132", + "nvrtc64_131", + "nvrtc64_130", + nullptr + }; + + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; + for (auto verpath=nvrtc64_versions; *verpath && !nvrtc.pnvrtcVersion; verpath++) + { + for (auto suffix=nvrtc64_suffices; *suffix; suffix++) + { + std::string candidateName(*verpath); + candidateName += *suffix; + + cuda_native::NVRTC candidate(candidateName.c_str()); + std::array candidateVersion = {-1,-1}; + if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) + continue; + + if (cudaVersionCode(candidateVersion[0],candidateVersion[1])<=cudaVersion) + { + nvrtc = std::move(candidate); + nvrtcVersion = candidateVersion; + nvrtcLibraryName = std::move(candidateName); + break; + } + + if (!fallbackNVRTC.pnvrtcVersion) + { + fallbackNVRTC = std::move(candidate); + fallbackNVRTCVersion = candidateVersion; + fallbackNVRTCLibraryName = std::move(candidateName); + } + } + } + + if (!nvrtc.pnvrtcVersion && fallbackNVRTC.pnvrtcVersion) + { + nvrtc = std::move(fallbackNVRTC); + nvrtcVersion = fallbackNVRTCVersion; + nvrtcLibraryName = std::move(fallbackNVRTCLibraryName); + } + #elif defined(_NBL_POSIX_API_) + nvrtcLibraryName = "nvrtc"; + nvrtc = cuda_native::NVRTC(nvrtcLibraryName.c_str()); + readNVRTCVersion(nvrtc,nvrtcVersion,nvrtcLibraryName.c_str()); + #else + #error "Unsuported Platform" + #endif // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) @@ -839,13 +1057,6 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste logger.log("CCUDAHandler: NVRTC runtime was not found. 
Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; } - int nvrtcVersion[2] = { -1,-1 }; - const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersionResult!=NVRTC_SUCCESS) - { - logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast(nvrtcVersionResult)); - return nullptr; - } if (nvrtcVersion[0] CCUDAHandler::create(system::ISystem* syste return nullptr; } + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); + auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); + const auto cudaDriverPath = loadedRuntimeModulePath("nvcuda"); + const auto nvrtcPath = loadedRuntimeModulePath(nvrtcLibraryName.c_str()); + const auto report = cudaRuntimeReport(buildVersion,cudaVersion,cudaDriverPath,nvrtcVersion,nvrtcLibraryName,nvrtcPath,runtimeEnvironment); + logger.log("%s",system::ILogger::ELL_INFO,report.c_str()); + + if (cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1])>cudaVersion) + { + logger.log( + "CCUDAHandler: NVRTC runtime %d.%d is newer than CUDA driver API %d.%d. PTX generated by this NVRTC may be unsupported by the installed driver.", + system::ILogger::ELL_WARNING, + nvrtcVersion[0],nvrtcVersion[1], + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion) + ); + } + if (runtimeEnvironment.includeDirs.empty()) + { + logger.log("CCUDAHandler: no CUDA runtime headers were discovered for NVRTC include paths.",system::ILogger::ELL_WARNING); + } + else + { + const auto& primaryIncludeDir = runtimeEnvironment.includeDirInfos.front(); + if (!primaryIncludeDir.completeRuntimeHeaderSet) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime header path %s does not contain cuda.h, cuda_runtime_api.h, and vector_types.h together. 
NVRTC may use later include paths for missing headers.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.path.generic_string().c_str() + ); + } + + const auto nvrtcVersionCode = cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1]); + if (primaryIncludeDir.cudaVersion!=0u && primaryIncludeDir.cudaVersion!=static_cast(nvrtcVersionCode)) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime headers report CUDA_VERSION %u (%s), while loaded NVRTC is %s. This is allowed by discovery policy, but kernels using version-specific CUDA headers may fail to compile.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.cudaVersion, + cudaVersionString(primaryIncludeDir.cudaVersion).c_str(), + cudaVersionString(nvrtcVersion).c_str() + ); + } + } + // add headers core::vector> headers; for (const auto& it : jitify::detail::get_jitsafe_headers_map()) @@ -864,14 +1120,13 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste headers.push_back(core::make_smart_refctd_ptr>( it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, - // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? 
std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } return core::smart_refctd_ptr( - new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)), + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc),cudaVersion,nvrtcVersion,std::move(runtimeEnvironment)),std::move(headers),std::move(_logger)), core::dont_grab ); } @@ -927,25 +1182,19 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } -static const core::vector& getDefaultRuntimeIncludeOptions() -{ - static const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment()); - return RuntimeIncludeOptions; -} - static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) { log.clear(); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions(); + const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); core::vector options; options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); for (const auto option : nvrtcOptions) options.push_back(option); - for (const auto& option : runtimeIncludeOptions) - options.push_back(option.c_str()); + for (const auto option : runtimeIncludeOptions) + options.push_back(option); const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); @@ -985,7 +1234,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct for (const auto& device : m_native->deviceStates) { - if (!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; @@ -1067,10 +1316,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::smart_refctd_ptr( - new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.info.handle),core::smart_refctd_ptr(this)), + auto cudaDevice = core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), core::dont_grab ); + if (!cudaDevice->isValid()) + return nullptr; + return std::move(cudaDevice); } } return nullptr; @@ -1104,6 +1356,16 @@ uint32_t CCUDAHandler::getBuildCUDASDKVersion() return 0u; } +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return 0u; +} + +std::array CCUDAHandler::getLoadedNVRTCVersion() const +{ + return {-1,-1}; +} + const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const { std::abort(); @@ -1114,6 +1376,11 @@ const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const std::abort(); } +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + return {nullptr,nullptr}; +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 8ccad3e119..54a710e48c 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,31 +20,31 @@ 
CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_native::SCUexternalMemory(m_native->handle); + return cuda_interop::SNativeHandle(m_native->handle); } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const { - if (!mappedBuffer) - return false; - - CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; - bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); - - CUdeviceptr nativeMappedBuffer = 0; - const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) - return false; - - *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer); - return true; + if (!mappedBuffer) + return false; + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + return false; + + *mappedBuffer = cuda_interop::SNativeHandle(nativeMappedBuffer); + return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index d495f979ab..1afd4a10b1 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,7 +19,7 @@ 
CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle); + return cuda_interop::SNativeHandle(m_native->handle); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 3a1500e77e..04384336d1 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -1,9 +1,10 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ -#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +#include "nbl/video/CUDAInteropNativeAPI.h" #include +#include namespace nbl::video { @@ -12,18 +13,33 @@ struct CCUDAHandler::SNativeState { struct SDeviceState { - cuda_native::SCUDADeviceInfo info = {}; + CUdevice handle = {}; + CUuuid uuid = {}; std::array attributes = {}; }; cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; - core::vector availableDevices; + int cudaDriverVersion = 0; + std::array nvrtcVersion = {-1,-1}; + // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
+ cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; + core::vector runtimeIncludeOptions; + core::vector runtimeIncludeOptionPtrs; core::vector deviceStates; - SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) + SNativeState( + cuda_native::CUDA&& _cuda, + cuda_native::NVRTC&& _nvrtc, + int _cudaDriverVersion, + std::array _nvrtcVersion, + cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) : cuda(std::move(_cuda)) , nvrtc(std::move(_nvrtc)) + , cudaDriverVersion(_cudaDriverVersion) + , nvrtcVersion(_nvrtcVersion) + , runtimeEnvironment(std::move(_runtimeEnvironment)) + , runtimeIncludeOptions(cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment)) {} }; From f2f62ce5985f65b81f00ed95949f4180de0678d1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 15:38:06 +0200 Subject: [PATCH 33/51] Polish CUDA interop review feedback --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/CCUDAExportableMemory.h | 2 - include/nbl/video/CCUDAHandler.h | 10 +++-- include/nbl/video/CUDAInteropHandles.h | 2 + include/nbl/video/CUDAInteropNativeAPI.h | 16 ++++++++ include/nbl/video/EApiType.h | 1 - include/nbl/video/declarations.h | 1 + src/nbl/ext/CUDAInterop/README.md | 12 ++++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 3 -- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 39 ------------------- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 8 ++-- .../ext/CUDAInterop/smoke/public_boundary.cpp | 22 +++-------- src/nbl/video/CCUDADevice.cpp | 4 +- src/nbl/video/CCUDAHandler.cpp | 34 +++++++++------- src/nbl/video/CUDAInteropNativeState.hpp | 4 +- 16 files changed, 70 insertions(+), 92 deletions(-) delete mode 100644 src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp diff --git a/examples_tests b/examples_tests index d373d313d3..a6268bc995 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d373d313d3e70579d650c7804af8a2785cfede9a +Subproject commit 
a6268bc9953b8d8a795b3b2eee8dbd897b05706e diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 56e81d4b2f..57a8b5262a 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,7 +87,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - std::array m_allocationGranularity = {}; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; bool m_valid = false; diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 510f483b3b..f1ae7f6031 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -19,8 +19,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted public: struct SCachedCreationParams { - size_t size; - uint32_t alignment; size_t granularSize; external_handle_t externalHandle; bool deviceLocal; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index e69792b217..578d720546 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -23,6 +23,7 @@ class IPhysicalDevice; namespace cuda_native { +// SDK-free forward declarations for the dynamic CUDA/NVRTC tables exposed by the opt-in native header. 
class CUDA; class NVRTC; } @@ -30,6 +31,8 @@ class NVRTC; namespace cuda_interop { inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +inline constexpr uint32_t RuntimeVersionComponentCount = 2u; +using SRuntimeVersion = std::array; struct SRuntimeIncludeDir { @@ -45,8 +48,9 @@ struct SRuntimeCompileEnvironment core::vector includeDirInfos; }; -NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); -NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles); inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) { core::vector options; @@ -62,7 +66,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDASDKVersion(); uint32_t getLoadedCUDADriverVersion() const; - std::array getLoadedNVRTCVersion() const; + cuda_interop::SRuntimeVersion getLoadedNVRTCVersion() const; const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; core::SRange getDefaultRuntimeIncludeOptions() const; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 987a130ad1..0b3cc9f488 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -11,6 +11,8 @@ namespace nbl::video::cuda_interop { +inline constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; + /* SDK-free CUDA handle surrogates used by Nabla's 
public video API. diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index 52dad41f09..37d8e0ec2d 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -4,6 +4,7 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#include #include #include "nbl/video/CUDAInterop.h" @@ -19,6 +20,11 @@ namespace nbl::video::cuda_native inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); +static_assert(CU_MEM_LOCATION_TYPE_INVALID==0); +static_assert(CU_MEM_LOCATION_TYPE_DEVICE==1); +static_assert(CU_MEM_LOCATION_TYPE_HOST==2); +static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA==3); +static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT+1==cuda_interop::AllocationGranularityLocationTypeCount); /* Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. @@ -177,6 +183,16 @@ struct SPTXResult NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + +// Opt-in convenience for examples/tests that intentionally assert on failures. Pass a CCUDAHandler reference. +// Nabla implementation code should prefer explicit error handling paths. 
+#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + assert(false); \ + } while (false) + NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 9b1a79e4d4..89be885b0f 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -1,7 +1,6 @@ #ifndef __NBL_E_API_TYPE_H_INCLUDED__ #define __NBL_E_API_TYPE_H_INCLUDED__ -#include "nbl/core/declarations.h" #include #ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 4393af1768..1a74514714 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -36,6 +36,7 @@ #include "nbl/video/utilities/CSmoothResizeSurface.h" #include "nbl/video/utilities/CDefaultSwapchainFramebuffers.h" #include "nbl/video/utilities/CAssetConverter.h" +#include "nbl/video/CUDAInterop.h" //VT //#include "nbl/video/IGPUVirtualTexture.h" diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 5677db046f..dff708aff6 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -68,14 +68,18 @@ auto memory = cudaDevice->createExportableMemory({ .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); +int driverVersion = 0; 
+NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); + nbl::video::cuda_native::SCUdeviceptr mapped; if (importedMemory) importedMemory->getMappedBuffer(mapped); +nbl::video::cuda_native::SCUdeviceptr exported = memory->getDeviceptr(); CUdeviceptr rawMapped = mapped; -CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); -auto& cu = handler->getCUDAFunctionTable(); -auto& nvrtc = handler->getNVRTCFunctionTable(); +CUdeviceptr rawExported = exported; std::string log; auto compile = nbl::video::cuda_native::compileDirectlyToPTX( @@ -102,12 +106,12 @@ if (pcuNewCall) - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. - `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for tests/examples that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. -- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. 
- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. ## ABI diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 7118eeff09..e16d3feac0 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -24,9 +24,6 @@ endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) -nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp) -target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) - if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) set(_nbl_cuda_interop_smoke_args PRIVATE) diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp deleted file mode 100644 index 31bf461804..0000000000 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "nbl/video/CUDAInterop.h" -#include "nbl/system/IApplicationFramework.h" - -#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla::Nabla must not propagate the CUDA build define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla::Nabla must not require CUDA SDK headers." 
-#endif - -namespace -{ - -class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework -{ - using base_t = nbl::system::IApplicationFramework; - -public: - using base_t::base_t; - - bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override - { - static_assert(std::is_class_v); - static_assert(std::is_class_v); - static_assert(std::is_class_v); - static_assert(std::is_class_v); - return isAPILoaded(); - } - - void workLoopBody() override {} - bool keepRunning() override { return false; } -}; - -} - -NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 79e85555b7..416b829fb1 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -39,8 +39,8 @@ using namespace nbl::video; if (importedFromVulkan) importedFromVulkan->getMappedBuffer(mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); - CUexternalSemaphore cudaSemaphore = nullptr; + const cuda_native::SCUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + cuda_native::SCUexternalSemaphore cudaSemaphore; if (importedSemaphore) cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; @@ -130,7 +130,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON - const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); + const nbl::core::vector explicitIncludeDirs; + const nbl::core::vector runtimePathFiles = {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}; + const auto runtimeEnvironment = 
nbl::video::cuda_interop::findRuntimeCompileEnvironment(explicitIncludeDirs, runtimePathFiles); if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) return false; #else diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index dc1c247806..73307599b1 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,24 +1,8 @@ #include "nabla.h" -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla consumers must not get the CUDA opt-in define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla consumers must not include CUDA SDK headers." -#endif - #include "nbl/system/IApplicationFramework.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla consumers must not get the CUDA opt-in define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla consumers must not include CUDA SDK headers." -#endif - #include "nbl/video/CUDAInterop.h" +#include #ifdef _NBL_COMPILE_WITH_CUDA_ #error "Nabla consumers must not get the CUDA opt-in define." 
@@ -40,6 +24,10 @@ class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFra bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); return isAPILoaded(); } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 0178f31fc7..29a6562640 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -167,8 +167,6 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { - .size = inParams.size, - .alignment = inParams.alignment, .granularSize = roundToGranularity(inParams.locationType, inParams.size), .deviceLocal = isDeviceLocal(location) }; @@ -208,7 +206,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, inParams.alignment, location, mem); CUDA_SUCCESS != err) { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 6d8b2ffb70..9305cf83c0 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -299,7 +299,7 @@ void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) } -SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles) { SRuntimeCompileEnvironment environment; @@ -314,8 +314,8 
@@ SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs) +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs) { - return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{}); + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(explicitIncludeDirs,EmptyRuntimePathFiles); +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment() +{ + static const core::vector EmptyIncludeDirs; + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(EmptyIncludeDirs,EmptyRuntimePathFiles); } } @@ -378,7 +386,7 @@ std::string cudaVersionString(int version) return stream.str(); } -std::string cudaVersionString(const std::array& version) +std::string cudaVersionString(const cuda_interop::SRuntimeVersion& version) { std::ostringstream stream; stream << version[0] << "." << version[1]; @@ -401,7 +409,7 @@ std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& std::string cudaRuntimeReport( const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, - const std::array& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeVersion& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) { std::ostringstream stream; @@ -489,7 +497,7 @@ uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const return m_native->cudaDriverVersion; } -std::array CCUDAHandler::getLoadedNVRTCVersion() const +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const { return m_native->nvrtcVersion; } @@ -970,7 +978,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // stop the pollution #undef SAFE_CUDA_CALL - auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, std::array& version, const char* 
name) -> bool + auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, cuda_interop::SRuntimeVersion& version, const char* name) -> bool { if (!candidate.pnvrtcVersion) return false; @@ -985,12 +993,12 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste }; cuda_native::NVRTC nvrtc = {}; - std::array nvrtcVersion = {-1,-1}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; std::string nvrtcLibraryName; #if defined(_NBL_WINDOWS_API_) cuda_native::NVRTC fallbackNVRTC = {}; - std::array fallbackNVRTCVersion = {-1,-1}; + cuda_interop::SRuntimeVersion fallbackNVRTCVersion = {-1,-1}; std::string fallbackNVRTCLibraryName; /* @@ -1016,7 +1024,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste candidateName += *suffix; cuda_native::NVRTC candidate(candidateName.c_str()); - std::array candidateVersion = {-1,-1}; + cuda_interop::SRuntimeVersion candidateVersion = {-1,-1}; if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) continue; @@ -1361,7 +1369,7 @@ uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const return 0u; } -std::array CCUDAHandler::getLoadedNVRTCVersion() const +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const { return {-1,-1}; } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 04384336d1..04a70c6e4e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -21,7 +21,7 @@ struct CCUDAHandler::SNativeState cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; int cudaDriverVersion = 0; - std::array nvrtcVersion = {-1,-1}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; core::vector runtimeIncludeOptions; @@ -32,7 +32,7 @@ struct CCUDAHandler::SNativeState cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc, int _cudaDriverVersion, - std::array _nvrtcVersion, + cuda_interop::SRuntimeVersion _nvrtcVersion, cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) : cuda(std::move(_cuda)) , nvrtc(std::move(_nvrtc)) From 9c504a14a63527f01cdf324672d7ac8c47e86749 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 17:11:06 +0200 Subject: [PATCH 34/51] Polish CUDA interop native header --- include/nbl/video/CCUDADevice.h | 3 ++- include/nbl/video/CUDAInteropHandles.h | 2 -- include/nbl/video/CUDAInteropNativeAPI.h | 22 +++++++--------------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 57a8b5262a..d6a1378dcb 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -80,6 +80,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: friend class CCUDAHandler; + static constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); bool isValid() const; @@ -87,7 +88,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - std::array m_allocationGranularity = {}; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; bool m_valid = false; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 0b3cc9f488..987a130ad1 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ 
b/include/nbl/video/CUDAInteropHandles.h @@ -11,8 +11,6 @@ namespace nbl::video::cuda_interop { -inline constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; - /* SDK-free CUDA handle surrogates used by Nabla's public video API. diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index 37d8e0ec2d..eb75f0eec3 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -20,11 +20,6 @@ namespace nbl::video::cuda_native inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); -static_assert(CU_MEM_LOCATION_TYPE_INVALID==0); -static_assert(CU_MEM_LOCATION_TYPE_DEVICE==1); -static_assert(CU_MEM_LOCATION_TYPE_HOST==2); -static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA==3); -static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT+1==cuda_interop::AllocationGranularityLocationTypeCount); /* Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. @@ -183,16 +178,6 @@ struct SPTXResult NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - -// Opt-in convenience for examples/tests that intentionally assert on failures. Pass a CCUDAHandler reference. -// Nabla implementation code should prefer explicit error handling paths. 
-#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ - do { \ - const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ - assert(false); \ - } while (false) - NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -202,6 +187,13 @@ NBL_API2 SPTXResult compileDirectlyToPTX( std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + assert(false); \ + } while (false) + } #endif From 0df750788774a7c8da94a9b4d14a649d3f7b4761 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:42:13 +0200 Subject: [PATCH 35/51] Use opaque CUDA interop handles --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 25 +--- include/nbl/video/CCUDAHandler.h | 32 +++++ include/nbl/video/CCUDAImportedMemory.h | 17 +++ include/nbl/video/CUDAInteropHandles.h | 113 +++++++++++++----- include/nbl/video/CUDAInteropNativeAPI.h | 57 ++++----- src/nbl/ext/CUDAInterop/README.md | 23 ++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 14 +-- src/nbl/video/CCUDADevice.cpp | 28 ++--- src/nbl/video/CCUDAExportableMemory.cpp | 6 +- src/nbl/video/CCUDAHandler.cpp | 92 +++++++------- src/nbl/video/CCUDAImportedMemory.cpp | 8 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 4 +- 13 files changed, 249 insertions(+), 172 deletions(-) diff --git 
a/examples_tests b/examples_tests index a6268bc995..eb8f44a1b5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a6268bc9953b8d8a795b3b2eee8dbd897b05706e +Subproject commit eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 538645ce3d..ea360d785a 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -40,30 +40,17 @@ namespace nbl::video::cuda_native { /* - CUDA SDK view of an SDK-free opaque handle. + This header specializes the SDK-free opaque handles from nbl/video/CUDAInteropHandles.h for the CUDA SDK + visible to this translation unit. After that opt-in, Nabla interop methods can be called with native CUDA/NVRTC + types such as CUdeviceptr, CUexternalSemaphore, nvrtcProgram, CUresult, and nvrtcResult. - The conversions are intentionally available only after including this header. Public Nabla headers expose - only the opaque SCU* values. Once a consumer opts in, the aliases below restore the CUDA spelling and - ergonomics for raw Driver API calls without adding accessors to every interop operation. Each alias maps one - Nabla opaque handle to the matching CUDA SDK handle and validates size/alignment against the SDK selected by - this opt-in translation unit. -*/ -using SCUdevice = cuda_interop::SNativeHandle; -using SCUcontext = cuda_interop::SNativeHandle; -using SCUdeviceptr = cuda_interop::SNativeHandle; -using SCUexternalMemory = cuda_interop::SNativeHandle; -using SCUexternalSemaphore = cuda_interop::SNativeHandle; - -/* - Check whether this opt-in translation unit uses the exact CUDA SDK version that was used to build Nabla's - CUDA interop implementation. Opaque handle layout is checked by SNativeHandle aliases above. This exact - version check is a policy helper for SDK-typed code that wants to warn about or reject compatible-but-different - SDK headers. 
+ The size/alignment checks live in nbl/video/CUDAInteropNativeAPI.h. This exact version check is a policy helper + for SDK-typed code that wants to warn about or reject compatible-but-different SDK headers. */ inline bool isBuildCUDASDKVersionExactMatch() { const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); - return buildVersion==0u || buildVersion==CUDA_VERSION; + return buildVersion==CUDA_VERSION; } } diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 578d720546..241f59ea5b 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -7,13 +7,16 @@ #include "nbl/core/declarations.h" #include "nbl/core/definitions.h" +#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/declarations.h" #include "nbl/system/path.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include #include #include +#include namespace nbl::video { @@ -72,6 +75,35 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::SRange getDefaultRuntimeIncludeOptions() const; inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } + struct SPTXResult + { + core::smart_refctd_ptr ptx; + cuda_interop::SNVRTCResult result; + }; + + static bool defaultHandleResult(cuda_interop::SCUresult result, const system::logger_opt_ptr& logger); + bool defaultHandleResult(cuda_interop::SCUresult result) const; + bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; + + cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + template + requires (!std::is_same_v,cuda_interop::SNVRTCProgram>) + cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + { + 
cuda_interop::SNVRTCProgram opaqueProgram = {}; + const auto result = createProgram(&opaqueProgram,std::move(source),name,headerCount,headerContents,includeNames); + if (prog) + *prog = static_cast(opaqueProgram); + return result; + } + cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; + cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; + SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; + SPTXResult compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr + ); + inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index e2c9bb6db6..5cdb1bb3f6 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -5,6 +5,7 @@ #include "nbl/video/CUDAInteropHandles.h" #include +#include #include namespace nbl::video @@ -19,6 +20,22 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } + template + requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) + bool getMappedBuffer(DevicePtr* mappedBuffer) const + { + cuda_interop::SCUdeviceptr opaqueMappedBuffer = {}; + const auto result = getMappedBuffer(&opaqueMappedBuffer); + if (result && mappedBuffer) + *mappedBuffer = static_cast(opaqueMappedBuffer); + return result; + } + template + requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) + bool getMappedBuffer(DevicePtr& mappedBuffer) const + { + return 
getMappedBuffer(&mappedBuffer); + } private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 987a130ad1..b9e5be244b 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -4,8 +4,10 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#include #include #include +#include #include namespace nbl::video::cuda_interop @@ -19,17 +21,8 @@ namespace nbl::video::cuda_interop not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. */ -template -struct alignas(alignof(Storage)) SOpaqueCUDAHandle -{ - uint8_t value[sizeof(Storage)] = {}; -}; - -struct SCUdevice : SOpaqueCUDAHandle {}; -struct SCUcontext : SOpaqueCUDAHandle {}; -struct SCUdeviceptr : SOpaqueCUDAHandle {}; -struct SCUexternalMemory : SOpaqueCUDAHandle {}; -struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; +template +struct SOpaqueCUDANativeType; template concept cuda_opaque_handle = @@ -38,36 +31,90 @@ concept cuda_opaque_handle = sizeof(Opaque)==sizeof(Native) && alignof(Opaque)==alignof(Native); -/* - Native view of an SDK-free opaque handle. - - This template does not depend on CUDA SDK types by itself. CUDAInteropNative.h binds it to concrete CU* types - after the consumer opts into CUDA SDK headers. The layout check keeps the public opaque handle and the native - SDK handle compatible in that translation unit while preserving Nabla's SDK-free public headers. 
-*/ template -struct SNativeHandle +concept cuda_native_handle_for = + requires { typename SOpaqueCUDANativeType::type; } && + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle { - using cuda_t = Native; - static_assert(cuda_opaque_handle); + uint8_t value[sizeof(Storage)] = {}; + + SOpaqueCUDAHandle() = default; + + template + requires cuda_native_handle_for + SOpaqueCUDAHandle(const Native& native) + { + operator=(native); + } + + template + requires cuda_native_handle_for + Derived& operator=(const Native& native) + { + std::memcpy(value,&native,sizeof(native)); + return static_cast(*this); + } - SNativeHandle() = default; - SNativeHandle(const SNativeHandle&) = default; - SNativeHandle(const cuda_t& native) { operator=(native); } - SNativeHandle(const Opaque& opaque) { operator=(opaque); } + template + requires cuda_native_handle_for + operator Native() const + { + Native native = {}; + std::memcpy(&native,value,sizeof(native)); + return native; + } - SNativeHandle& operator=(const SNativeHandle&) = default; - SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } - SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + template + requires cuda_native_handle_for + friend bool operator==(const Derived& lhs, const Native& rhs) + { + return static_cast(lhs)==rhs; + } - operator cuda_t&() { return value; } - operator const cuda_t&() const { return value; } - operator Opaque&() { return reinterpret_cast(value); } - operator const Opaque&() const { return reinterpret_cast(value); } + template + requires cuda_native_handle_for + friend bool operator==(const Native& lhs, const Derived& rhs) + { + return lhs==static_cast(rhs); + } - cuda_t value = {}; + template + requires cuda_native_handle_for + friend bool operator!=(const Derived& lhs, const Native& rhs) + { + return !(lhs==rhs); + } + + template + requires 
cuda_native_handle_for + friend bool operator!=(const Native& lhs, const Derived& rhs) + { + return !(lhs==rhs); + } }; +#define NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(NAME, STORAGE) \ + struct NAME : SOpaqueCUDAHandle \ + { \ + using SOpaqueCUDAHandle::SOpaqueCUDAHandle; \ + using SOpaqueCUDAHandle::operator=; \ + } + +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdevice, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUcontext, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdeviceptr, uintptr_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalMemory, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalSemaphore, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUresult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCResult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCProgram, void*); + +#undef NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE + } #endif diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index eb75f0eec3..d61ce32b67 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -8,12 +8,34 @@ #include #include "nbl/video/CUDAInterop.h" -#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" #include "cuda.h" #include "nvrtc.h" +namespace nbl::video::cuda_interop +{ + +template<> struct SOpaqueCUDANativeType { using type = CUdevice; }; +template<> struct SOpaqueCUDANativeType { using type = CUcontext; }; +template<> struct SOpaqueCUDANativeType { using type = CUdeviceptr; }; +template<> struct SOpaqueCUDANativeType { using type = CUexternalMemory; }; +template<> struct SOpaqueCUDANativeType { using type = CUexternalSemaphore; }; +template<> struct SOpaqueCUDANativeType { using type = CUresult; }; +template<> struct SOpaqueCUDANativeType { using type = nvrtcResult; }; +template<> struct SOpaqueCUDANativeType { using type = nvrtcProgram; }; + +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); 
+static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); + +} + namespace nbl::video::cuda_native { @@ -30,9 +52,8 @@ static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or h do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between - raw CUDA code and Nabla's exported CUDA interop objects: dynamic function tables, NVRTC helpers, error handling, - and runtime header discovery integration. Consumers enter this surface only by linking Nabla::ext::CUDAInterop - and including nbl/ext/CUDAInterop/CUDAInteropNative.h. + raw CUDA code and Nabla's exported CUDA interop objects. Consumers enter this surface only by linking + Nabla::ext::CUDAInterop and including nbl/ext/CUDAInterop/CUDAInteropNative.h. */ using LibLoader = system::DefaultFuncPtrLoader; @@ -161,36 +182,10 @@ NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, nvrtcGetProgramLogSize ); -struct SPTXResult -{ - core::smart_refctd_ptr ptx; - nvrtcResult result; -}; - -/* - Exported Nabla glue declarations with CUDA SDK signatures. - - These are not a CUDA wrapper. They are the small boundary surface used for error handling, NVRTC helpers, - runtime header discovery integration, and dynamic CUDA/NVRTC table access. Nabla owns the definitions. - The signatures mention CUDA SDK types, so they are intentionally unavailable to consumers that only parse - SDK-free nbl/video/CCUDA*.h headers. 
-*/ -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); -NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr -); - #define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ do { \ const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + if (!(handler).defaultHandleResult(nblCudaInteropResult)) \ assert(false); \ } while (false) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index dff708aff6..55db5cbd24 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,8 +6,8 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. 
It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and exported Nabla glue functions whose signatures use CUDA SDK types. -- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and binds SDK-free opaque handles to CUDA/NVRTC SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header so SDK-typed code can use CUDA/NVRTC handles directly with Nabla interop methods. ## CMake Usage @@ -73,17 +73,14 @@ auto& nvrtc = handler->getNVRTCFunctionTable(); int driverVersion = 0; NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); -nbl::video::cuda_native::SCUdeviceptr mapped; +CUdeviceptr mapped = 0; if (importedMemory) importedMemory->getMappedBuffer(mapped); -nbl::video::cuda_native::SCUdeviceptr exported = memory->getDeviceptr(); -CUdeviceptr rawMapped = mapped; -CUdeviceptr rawExported = exported; +CUdeviceptr exported = memory->getDeviceptr(); std::string log; -auto compile = nbl::video::cuda_native::compileDirectlyToPTX( - *handler, +auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), @@ -102,11 +99,11 @@ if (pcuNewCall) pcuNewCall(...); ``` -- `cuda_interop::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. The template itself is SDK-free. 
`CUDAInteropNative.h` only provides CUDA-typed aliases. +- `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. -- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. -- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for tests/examples that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. +- `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. 
It is a policy helper, not an automatic runtime rejection rule. Smoke examples: @@ -138,7 +135,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - The first include root is not required to match the SDK used to build Nabla. It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. - If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `CCUDAHandler` captures discovered include directories when it is created. `cuda_native::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. +- `CCUDAHandler` captures discovered include directories when it is created. `CCUDAHandler::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 416b829fb1..c2f9a97ac4 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -35,14 +35,14 @@ using namespace nbl::video; auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); - cuda_native::SCUdeviceptr mappedVulkanMemory; + CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) importedFromVulkan->getMappedBuffer(mappedVulkanMemory); - const cuda_native::SCUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); - cuda_native::SCUexternalSemaphore cudaSemaphore; + const CUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + CUexternalSemaphore cudaSemaphore = nullptr; if (importedSemaphore) - cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); + cudaSemaphore = importedSemaphore->getInternalObject(); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } @@ -100,8 +100,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto compile = cuda_native::compileDirectlyToPTX( - handler, + auto compile = handler.compileDirectlyToPTX( std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, @@ -126,7 +125,8 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 29a6562640..25caa0162b 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -88,9 +88,9 @@ CCUDADevice::CCUDADevice( const auto& cu = m_handler->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + if (!m_handler->defaultHandleResult(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) return; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + if (!m_handler->defaultHandleResult(cu.pcuCtxSetCurrent(m_native->context))) return; for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) @@ -109,7 +109,7 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!m_handler->defaultHandleResult(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) return; } m_valid = true; @@ -117,12 +117,12 @@ CCUDADevice::CCUDADevice( cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_interop::SNativeHandle(m_native->handle); + return m_native->handle; } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_interop::SNativeHandle(m_native->context); + return m_native->context; } static bool isDeviceLocal(CUmemLocationType location) @@ -140,7 +140,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } @@ -151,8 +151,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice if (auto err = cu.pcuMemSetAccess(ptr, 
size, &accessDesc, 1); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(handler, cu.pcuMemUnmap(ptr, size)); - cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); + handler.defaultHandleResult(cu.pcuMemUnmap(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } @@ -202,7 +202,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); return nullptr; } @@ -210,7 +210,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); if (!CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); @@ -220,9 +220,9 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, err); - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(err); + handler->defaultHandleResult(cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); if (!CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; @@ -297,7 +297,7 @@ core::smart_refctd_ptr 
CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { if (m_native->context) - cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); + m_handler->defaultHandleResult(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); } bool CCUDADevice::isValid() const diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 722c958b68..9333a39f54 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -54,9 +54,9 @@ CCUDAExportableMemory::~CCUDAExportableMemory() { const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); + m_device->getHandler()->defaultHandleResult(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); + m_device->getHandler()->defaultHandleResult(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); if (!CloseExternalHandle(m_params.externalHandle)) m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); @@ -65,7 +65,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_interop::SNativeHandle(m_native->ptr); + return m_native->ptr; } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9305cf83c0..094046ea6c 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -520,11 +520,9 @@ core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; } -namespace cuda_native -{ - -bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool 
CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult, const system::logger_opt_ptr& logger) { + const CUresult result = opaqueResult; switch (result) { case CUDA_SUCCESS: @@ -893,26 +891,28 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) return false; } -bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult) const { + const CUresult result = opaqueResult; if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) { - const auto cudaVersion = handler.getLoadedCUDADriverVersion(); - const auto nvrtcVersion = handler.getLoadedNVRTCVersion(); - handler.getLogger().log( + const auto cudaVersion = getLoadedCUDADriverVersion(); + const auto nvrtcVersion = getLoadedNVRTCVersion(); + getLogger().log( "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", system::ILogger::ELL_ERROR, cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), nvrtcVersion[0],nvrtcVersion[1] ); } - return defaultHandleResult(result,handler.getLogger()); + return defaultHandleResult(opaqueResult,getLogger()); } -bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SNVRTCResult opaqueResult) const { - const auto& nvrtc = handler.getNVRTCFunctionTable(); - const auto logger = handler.getLogger(); + const nvrtcResult result = opaqueResult; + const auto& nvrtc = getNVRTCFunctionTable(); + const auto logger = getLogger(); switch (result) { case NVRTC_SUCCESS: @@ -928,8 +928,6 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) return false; } -} - core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { const system::logger_opt_ptr logger(_logger.get()); @@ -1139,10 +1137,7 @@ 
core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste ); } -namespace cuda_native -{ - -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -1151,33 +1146,40 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string #else #error "Unsuported Platform" #endif - return handler.getNVRTCFunctionTable().pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + nvrtcProgram nativeProgram = nullptr; + const auto result = getNVRTCFunctionTable().pnvrtcCreateProgram(&nativeProgram,source.c_str(),name,headerCount,headerContents,includeNames); + if (prog) + *prog = nativeProgram; + return result; } -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +cuda_interop::SNVRTCResult CCUDAHandler::compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const { - return handler.getNVRTCFunctionTable().pnvrtcCompileProgram(prog,options.size(),options.begin()); + const nvrtcProgram nativeProgram = prog; + return getNVRTCFunctionTable().pnvrtcCompileProgram(nativeProgram,options.size(),options.begin()); } -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +cuda_interop::SNVRTCResult CCUDAHandler::getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const { size_t _size = 0ull; - const auto& nvrtc = handler.getNVRTCFunctionTable(); - nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + const nvrtcProgram nativeProgram = prog; + 
const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(nativeProgram, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(nativeProgram,log.data()); } -SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) const { size_t _size = 0ull; - const auto& nvrtc = handler.getNVRTCFunctionTable(); - nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(prog,&_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(nativeProgram,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -1187,13 +1189,14 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; } -static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string& log) { log.clear(); - if (result!=NVRTC_SUCCESS) + const nvrtcResult nativeResult = result; + if (nativeResult!=NVRTC_SUCCESS) return {nullptr,result}; const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); @@ -1206,30 +1209,29 @@ static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult r const auto* optionsBegin = options.empty() ? 
nullptr:options.data(); const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); - result = compileProgram(handler,program,{optionsBegin,optionsEnd}); - getProgramLog(handler,program,log); - if (result!=NVRTC_SUCCESS) + result = handler.compileProgram(program,{optionsBegin,optionsEnd}); + handler.getProgramLog(program,log); + if (static_cast(result)!=NVRTC_SUCCESS) return {nullptr,result}; - return getPTX(handler,program); + return handler.getPTX(program); } -SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, +CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + cuda_interop::SNVRTCProgram program = {}; + cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; auto cleanup = core::makeRAIIExiter([&]() -> void { - if (program) - handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + nvrtcProgram nativeProgram = program; + if (nativeProgram) + getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); }); - result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); -} - + result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 54a710e48c..3743790a58 100644 --- 
a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,7 +20,7 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_interop::SNativeHandle(m_native->handle); + return m_native->handle; } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const @@ -35,17 +35,17 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CUdeviceptr nativeMappedBuffer = 0; const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + if (!m_device->getHandler()->defaultHandleResult(result)) return false; - *mappedBuffer = cuda_interop::SNativeHandle(nativeMappedBuffer); + *mappedBuffer = nativeMappedBuffer; return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalMemory(m_native->handle)); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 1afd4a10b1..49495e11e2 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,13 +19,13 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr(m_native->handle); + return m_native->handle; } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle)); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalSemaphore(m_native->handle)); 
} } From 21d3b7ce6761ddd4744f310a3412d03a731f3e2e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:57:28 +0200 Subject: [PATCH 36/51] Accept CUDA handler pointers in assert helper --- examples_tests | 2 +- include/nbl/video/CUDAInteropNativeAPI.h | 2 +- src/nbl/ext/CUDAInterop/README.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index eb8f44a1b5..39441760d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf +Subproject commit 39441760d335467158a340ad366302235ba6c30e diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index d61ce32b67..e3cf7c6f78 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -185,7 +185,7 @@ NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, #define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ do { \ const auto nblCudaInteropResult = (expr); \ - if (!(handler).defaultHandleResult(nblCudaInteropResult)) \ + if (!(handler)->defaultHandleResult(nblCudaInteropResult)) \ assert(false); \ } while (false) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 55db5cbd24..f5049a775a 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -71,7 +71,7 @@ auto memory = cudaDevice->createExportableMemory({ auto& cu = handler->getCUDAFunctionTable(); auto& nvrtc = handler->getNVRTCFunctionTable(); int driverVersion = 0; -NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); +NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), handler); CUdeviceptr mapped = 0; if (importedMemory) @@ -103,7 +103,7 @@ if (pcuNewCall) - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. 
Nabla stores them as integer values in its public ABI. - `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. - `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. -- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
Smoke examples: From dfca17ee1dc891ecb71451bd98c44f7a8dd9b8e8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 20:04:14 +0200 Subject: [PATCH 37/51] Consolidate CUDA native handle declarations --- include/nbl/video/CUDAInteropNativeAPI.h | 31 +++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index e3cf7c6f78..6084d4a00c 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -16,23 +16,20 @@ namespace nbl::video::cuda_interop { -template<> struct SOpaqueCUDANativeType { using type = CUdevice; }; -template<> struct SOpaqueCUDANativeType { using type = CUcontext; }; -template<> struct SOpaqueCUDANativeType { using type = CUdeviceptr; }; -template<> struct SOpaqueCUDANativeType { using type = CUexternalMemory; }; -template<> struct SOpaqueCUDANativeType { using type = CUexternalSemaphore; }; -template<> struct SOpaqueCUDANativeType { using type = CUresult; }; -template<> struct SOpaqueCUDANativeType { using type = nvrtcResult; }; -template<> struct SOpaqueCUDANativeType { using type = nvrtcProgram; }; - -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); +#define NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(OPAQUE, NATIVE) \ + template<> struct SOpaqueCUDANativeType { using type = NATIVE; }; \ + static_assert(cuda_opaque_handle) + +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdevice, CUdevice); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUcontext, CUcontext); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdeviceptr, CUdeviceptr); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalMemory, CUexternalMemory); 
+NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalSemaphore, CUexternalSemaphore); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUresult, CUresult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCResult, nvrtcResult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCProgram, nvrtcProgram); + +#undef NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE } From 525315eeb9d3e614d565e0fba3b32382336989ac Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 23:44:30 +0200 Subject: [PATCH 38/51] Tighten CUDA native output bridges --- include/nbl/video/CCUDAHandler.h | 13 ++---- include/nbl/video/CCUDAImportedMemory.h | 20 ++------- include/nbl/video/CUDAInteropHandles.h | 60 +++++++++++++++---------- src/nbl/ext/CUDAInterop/README.md | 4 +- 4 files changed, 46 insertions(+), 51 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 241f59ea5b..9975a7e212 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include namespace nbl::video { @@ -86,15 +86,10 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - template - requires (!std::is_same_v,cuda_interop::SNVRTCProgram>) - cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + NBL_CUDA_INTEROP_NATIVE_FOR(Program, cuda_interop::SNVRTCProgram) + inline cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* 
includeNames=nullptr) { - cuda_interop::SNVRTCProgram opaqueProgram = {}; - const auto result = createProgram(&opaqueProgram,std::move(source),name,headerCount,headerContents,includeNames); - if (prog) - *prog = static_cast(opaqueProgram); - return result; + return createProgram(cuda_interop::asOpaqueOutput(prog),std::move(source),name,headerCount,headerContents,includeNames); } cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 5cdb1bb3f6..720ae30b3d 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -5,8 +5,6 @@ #include "nbl/video/CUDAInteropHandles.h" #include -#include -#include namespace nbl::video { @@ -19,22 +17,10 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; - bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } - template - requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) - bool getMappedBuffer(DevicePtr* mappedBuffer) const + NBL_CUDA_INTEROP_NATIVE_FOR(DevicePtr, cuda_interop::SCUdeviceptr) + inline bool getMappedBuffer(DevicePtr& mappedBuffer) const { - cuda_interop::SCUdeviceptr opaqueMappedBuffer = {}; - const auto result = getMappedBuffer(&opaqueMappedBuffer); - if (result && mappedBuffer) - *mappedBuffer = static_cast(opaqueMappedBuffer); - return result; - } - template - requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) - bool getMappedBuffer(DevicePtr& mappedBuffer) const - { - return getMappedBuffer(&mappedBuffer); + return getMappedBuffer(cuda_interop::asOpaqueOutput(mappedBuffer)); } 
private: diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index b9e5be244b..3b555b599f 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace nbl::video::cuda_interop @@ -34,8 +33,32 @@ concept cuda_opaque_handle = template concept cuda_native_handle_for = requires { typename SOpaqueCUDANativeType::type; } && - std::same_as,typename SOpaqueCUDANativeType::type> && - cuda_opaque_handle>; + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +template +requires cuda_native_handle_for +Opaque* asOpaqueOutput(Native* native) +{ + return reinterpret_cast(native); +} + +template +requires cuda_native_handle_for +Opaque* asOpaqueOutput(Native& native) +{ + return asOpaqueOutput(&native); +} + +/* + Declare a narrow native-reference bridge for SDK opt-in code. Value conversions make SCU* handles usable as + native CUDA handles after CUDAInteropNative.h is included, but output parameters still need a writable object + whose storage matches the opaque handle. Use asOpaqueOutput inside such bridge overloads. This macro keeps + them short and constrained to the exact SDK type validated for the opaque handle. 
+*/ +#define NBL_CUDA_INTEROP_NATIVE_FOR(TYPE, OPAQUE) \ + template \ + requires ::nbl::video::cuda_interop::cuda_native_handle_for template struct alignas(alignof(Storage)) SOpaqueCUDAHandle @@ -53,47 +76,38 @@ struct alignas(alignof(Storage)) SOpaqueCUDAHandle template requires cuda_native_handle_for - Derived& operator=(const Native& native) - { - std::memcpy(value,&native,sizeof(native)); - return static_cast(*this); - } - - template - requires cuda_native_handle_for - operator Native() const + operator Native&() { - Native native = {}; - std::memcpy(&native,value,sizeof(native)); - return native; + return *reinterpret_cast(value); } template requires cuda_native_handle_for - friend bool operator==(const Derived& lhs, const Native& rhs) + operator const Native&() const { - return static_cast(lhs)==rhs; + return *reinterpret_cast(value); } template requires cuda_native_handle_for - friend bool operator==(const Native& lhs, const Derived& rhs) + Derived& operator=(const Native& native) { - return lhs==static_cast(rhs); + static_cast(*this) = native; + return static_cast(*this); } template requires cuda_native_handle_for - friend bool operator!=(const Derived& lhs, const Native& rhs) + friend bool operator==(const Derived& lhs, const Native& rhs) { - return !(lhs==rhs); + return static_cast(lhs)==rhs; } template requires cuda_native_handle_for - friend bool operator!=(const Native& lhs, const Derived& rhs) + friend bool operator==(const Native& lhs, const Derived& rhs) { - return !(lhs==rhs); + return lhs==static_cast(rhs); } }; diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index f5049a775a..1ebeb79a48 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -101,8 +101,8 @@ if (pcuNewCall) - `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. 
After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. -- `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- SDK-free output parameters stay pointer-based. SDK opt-in code can pass native CUDA output variables directly through small inline bridge overloads. +- `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. - `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
From d8d4c3b8d6e2e79ab49bb56c8b6e69b04de02624 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 06:37:07 +0200 Subject: [PATCH 39/51] Centralize CUDA output bridge --- include/nbl/video/CCUDAHandler.h | 8 +-- include/nbl/video/CCUDAImportedMemory.h | 7 +-- include/nbl/video/CUDAInteropHandles.h | 57 ++++++++++++------- src/nbl/ext/CUDAInterop/README.md | 5 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 17 ++++++ src/nbl/video/CCUDAHandler.cpp | 4 +- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9975a7e212..9af65ff25b 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,6 @@ #include #include #include -#include namespace nbl::video { @@ -85,12 +84,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted bool defaultHandleResult(cuda_interop::SCUresult result) const; bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; - cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - NBL_CUDA_INTEROP_NATIVE_FOR(Program, cuda_interop::SNVRTCProgram) - inline cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(cuda_interop::asOpaqueOutput(prog),std::move(source),name,headerCount,headerContents,includeNames); - } + cuda_interop::SNVRTCResult createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, 
core::SRange options) const; cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 720ae30b3d..0266706480 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -16,12 +16,7 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted public: ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; - bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; - NBL_CUDA_INTEROP_NATIVE_FOR(DevicePtr, cuda_interop::SCUdeviceptr) - inline bool getMappedBuffer(DevicePtr& mappedBuffer) const - { - return getMappedBuffer(cuda_interop::asOpaqueOutput(mappedBuffer)); - } + bool getMappedBuffer(cuda_interop::SOutput mappedBuffer) const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 3b555b599f..c0002f5cc9 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -36,29 +36,46 @@ concept cuda_native_handle_for = std::same_as,typename SOpaqueCUDANativeType::type> && cuda_opaque_handle>; -template -requires cuda_native_handle_for -Opaque* asOpaqueOutput(Native* native) -{ - return reinterpret_cast(native); -} +/* + Output bridge for SDK-free APIs that write CUDA/NVRTC handles. -template -requires cuda_native_handle_for -Opaque* asOpaqueOutput(Native& native) -{ - return asOpaqueOutput(&native); -} + Value conversions in SOpaqueCUDAHandle are enough for inputs and return values, but C++ does not apply those + user-defined conversions through output pointers or mutable output references. This type centralizes that one + boundary case. 
Without it, every Nabla method that writes a native CUDA/NVRTC handle would need a separate + SDK-typed overload, or SDK opt-in callers would have to spell the SDK-free SCU* type manually. With SOutput, + Nabla methods keep one SDK-free signature while SDK opt-in callers still use raw CUDA spelling: -/* - Declare a narrow native-reference bridge for SDK opt-in code. Value conversions make SCU* handles usable as - native CUDA handles after CUDAInteropNative.h is included, but output parameters still need a writable object - whose storage matches the opaque handle. Use asOpaqueOutput inside such bridge overloads. This macro keeps - them short and constrained to the exact SDK type validated for the opaque handle. + CUdeviceptr ptr = 0; + importedMemory->getMappedBuffer(ptr); + nvrtcProgram program = nullptr; + handler->createProgram(program,std::move(source),"kernel.cu"); + + SDK-free callers can pass SCU* objects or SCU* pointers. SDK opt-in callers can pass the matching native + CUDA/NVRTC object or pointer after CUDAInteropNative.h specializes SOpaqueCUDANativeType for the selected SDK. 
*/ -#define NBL_CUDA_INTEROP_NATIVE_FOR(TYPE, OPAQUE) \ - template \ - requires ::nbl::video::cuda_interop::cuda_native_handle_for +template +struct SOutput +{ + SOutput(std::nullptr_t) : ptr(nullptr) {} + SOutput(Opaque& opaque) : ptr(&opaque) {} + SOutput(Opaque* opaque) : ptr(opaque) {} + + template + requires cuda_native_handle_for + SOutput(Native& native) : ptr(reinterpret_cast(&native)) {} + + template + requires cuda_native_handle_for + SOutput(Native* native) : ptr(reinterpret_cast(native)) {} + + Opaque* get() const { return ptr; } + Opaque& operator*() const { return *ptr; } + operator Opaque*() const { return ptr; } + explicit operator bool() const { return ptr!=nullptr; } + + private: + Opaque* ptr; +}; template struct alignas(alignof(Storage)) SOpaqueCUDAHandle diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 1ebeb79a48..b764dcea93 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -79,6 +79,9 @@ if (importedMemory) CUdeviceptr exported = memory->getDeviceptr(); +nvrtcProgram program = nullptr; +auto createResult = handler->createProgram(program, std::string(cudaSource), "kernel.cu"); + std::string log; auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), @@ -101,7 +104,7 @@ if (pcuNewCall) - `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- SDK-free output parameters stay pointer-based. SDK opt-in code can pass native CUDA output variables directly through small inline bridge overloads. 
+- SDK-free output parameters use `cuda_interop::SOutput<...>`. SDK-free code can pass opaque `SCU*` values or pointers. SDK opt-in code can pass matching native CUDA/NVRTC output variables directly, for example `CUdeviceptr mapped; importedMemory->getMappedBuffer(mapped);` or `nvrtcProgram program; handler->createProgram(program, ...)`. - `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. - `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index c2f9a97ac4..71f2d3e7b9 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -111,6 +112,19 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) ); return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; } + +bool nativeNVRTCOutputProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + extern "C" __global__ void native_output_probe() {} + )cuda"; + + nvrtcProgram program = nullptr; + const auto result = handler.createProgram(program,std::string(Source),"native_output_probe.cu"); + if (program) + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + return result==NVRTC_SUCCESS; +} } class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework @@ -154,6 +168,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) return false; + if (!nativeNVRTCOutputProbe(*handler)) + return false; + if (!cudaFp16HeaderCompileProbe(*handler)) return false; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 094046ea6c..f81e6e6ade 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -1137,7 +1137,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste ); } -cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount, const char* const* 
headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -1230,7 +1230,7 @@ CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); }); - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + result = createProgram(program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 3743790a58..ec5438643f 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -23,7 +23,7 @@ cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const return m_native->handle; } -bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput mappedBuffer) const { if (!mappedBuffer) return false; @@ -73,7 +73,7 @@ cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const return {}; } -bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr*) const +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput) const { return false; } From fe3fd663ab41c63b64c07acd786566e30e753ded Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 08:40:13 +0200 Subject: [PATCH 40/51] Document CUDA interop handles --- include/nbl/video/CUDAInteropHandles.h | 37 +++++++++++--------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index c0002f5cc9..a7664310aa 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -13,12 +13,17 @@ namespace nbl::video::cuda_interop { /* - 
SDK-free CUDA handle surrogates used by Nabla's public video API. + SDK-free CUDA interop boundary. - These types are the small glue layer between Nabla and SDK-typed CUDA interop code. They let nbl/video/CCUDA*.h - expose CUDA-related objects without including cuda.h or nvrtc.h, so consumers that only link Nabla::Nabla do - not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back - to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. + Public nbl/video/CCUDA*.h headers cannot include cuda.h or nvrtc.h, but they still need to carry CUDA interop + state and write CUDA/NVRTC handles for opt-in users. The split below keeps those two roles explicit: + - SOpaqueCUDAHandle owns handle bits and is used in Nabla object layout, parameters, and return values. + - SOutput is a non-owning output adapter. C++ does not apply user-defined conversions through T* or mutable T&, + so output parameters need a small bridge to write directly into either SCU* storage or native SDK storage. + + CUDAInteropNative.h is the only header that maps these opaque types back to CUDA/NVRTC SDK types. These helpers + are class templates with in-class member definitions, so they are inline by the language rules and add no exported + symbols. */ template struct SOpaqueCUDANativeType; @@ -37,21 +42,8 @@ concept cuda_native_handle_for = cuda_opaque_handle>; /* - Output bridge for SDK-free APIs that write CUDA/NVRTC handles. - - Value conversions in SOpaqueCUDAHandle are enough for inputs and return values, but C++ does not apply those - user-defined conversions through output pointers or mutable output references. This type centralizes that one - boundary case. Without it, every Nabla method that writes a native CUDA/NVRTC handle would need a separate - SDK-typed overload, or SDK opt-in callers would have to spell the SDK-free SCU* type manually. 
With SOutput, - Nabla methods keep one SDK-free signature while SDK opt-in callers still use raw CUDA spelling: - - CUdeviceptr ptr = 0; - importedMemory->getMappedBuffer(ptr); - nvrtcProgram program = nullptr; - handler->createProgram(program,std::move(source),"kernel.cu"); - - SDK-free callers can pass SCU* objects or SCU* pointers. SDK opt-in callers can pass the matching native - CUDA/NVRTC object or pointer after CUDAInteropNative.h specializes SOpaqueCUDANativeType for the selected SDK. + Non-owning output bridge for SDK-free APIs. It keeps one Nabla signature while opt-in callers can pass raw + CUDA/NVRTC output variables directly, e.g. `CUdeviceptr ptr; memory->getMappedBuffer(ptr);`. */ template struct SOutput @@ -68,7 +60,6 @@ struct SOutput requires cuda_native_handle_for SOutput(Native* native) : ptr(reinterpret_cast(native)) {} - Opaque* get() const { return ptr; } Opaque& operator*() const { return *ptr; } operator Opaque*() const { return ptr; } explicit operator bool() const { return ptr!=nullptr; } @@ -77,6 +68,10 @@ struct SOutput Opaque* ptr; }; +/* + Owned opaque value used in public Nabla ABI. Native reference conversions become available only after the opt-in + header specializes SOpaqueCUDANativeType for the selected CUDA SDK. 
+*/ template struct alignas(alignof(Storage)) SOpaqueCUDAHandle { From d5dfadefad18f18e657d96fc06b771568a161bbf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 09:27:44 +0200 Subject: [PATCH 41/51] Make CUDA PTX compile log optional --- examples_tests | 2 +- include/nbl/video/CCUDAHandler.h | 2 +- src/nbl/ext/CUDAInterop/README.md | 2 +- src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 10 ++++++---- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples_tests b/examples_tests index 39441760d3..b4a8725d54 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 39441760d335467158a340ad366302235ba6c30e +Subproject commit b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9af65ff25b..4d2324cfa6 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -90,7 +90,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; SPTXResult compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr + std::string* log=nullptr, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); inline core::SRange getSTDHeaders() diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index b764dcea93..0d8ebe2f08 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -87,7 +87,7 @@ auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), - log + &log ); ``` diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 
71f2d3e7b9..d1c15822cd 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -105,7 +105,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, - log, + &log, 0, nullptr, nullptr diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index f81e6e6ade..c07af698b1 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -1192,9 +1192,10 @@ CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; } -static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string& log) +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string* log) { - log.clear(); + if (log) + log->clear(); const nvrtcResult nativeResult = result; if (nativeResult!=NVRTC_SUCCESS) return {nullptr,result}; @@ -1210,7 +1211,8 @@ static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); result = handler.compileProgram(program,{optionsBegin,optionsEnd}); - handler.getProgramLog(program,log); + if (log) + handler.getProgramLog(program,*log); if (static_cast(result)!=NVRTC_SUCCESS) return {nullptr,result}; @@ -1219,7 +1221,7 @@ static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) + std::string* log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { cuda_interop::SNVRTCProgram program = {}; cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; From 2d53e9af42dea6f618c5525199f3242ea5058fad Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 09:56:09 +0200 Subject: [PATCH 42/51] Enable CUDA in Windows CI --- .github/workflows/build-nabla.yml | 154 +++++++++++++++++++++++++++++- CMakePresets.json | 2 +- 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8a62da4fc7..dd782ec389 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -47,9 +47,95 @@ jobs: } & $rgExe --version + prepare-host-cuda: + name: Prepare host CUDA 13.2 + runs-on: windows-2022 + + env: + cudaVersion: '13.2.1' + cudaMajorMinor: '13.2' + cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 + cudaCacheRoot: ${{ runner.tool_cache }}\cuda\v13.2 + + steps: + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaCacheRoot }} + key: cuda-toolkit-${{ env.cudaVersion }}-windows-x64-v1 + + - name: Install CUDA Toolkit + if: steps.cache-cuda.outputs.cache-hit != 'true' + shell: pwsh + run: | + function Test-CudaToolkit { + 
param([string]$Root) + + $nvcc = Join-Path $Root 'bin\nvcc.exe' + $cudaH = Join-Path $Root 'include\cuda.h' + $nvrtcH = Join-Path $Root 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + return $false + } + + $version = & $nvcc --version 2>&1 + return ($LASTEXITCODE -eq 0 -and ($version -match "release $env:cudaMajorMinor")) + } + + if (Test-CudaToolkit $env:cudaCacheRoot) { + Write-Host "CUDA Toolkit $env:cudaMajorMinor already restored at $env:cudaCacheRoot" + return + } + + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + if (-not (Get-Command winget -ErrorAction SilentlyContinue)) { + throw "winget is required to install CUDA Toolkit $env:cudaVersion on the host runner." + } + + winget install ` + --exact ` + --id Nvidia.CUDA ` + --version $env:cudaVersion ` + --source winget ` + --accept-package-agreements ` + --accept-source-agreements ` + --disable-interactivity + + if ($LASTEXITCODE -ne 0) { + throw "CUDA Toolkit $env:cudaVersion installation failed." + } + } + + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + } + + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + } + $global:LASTEXITCODE = 0 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' + $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' + $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." 
+ } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match "release $env:cudaMajorMinor")) { + throw "Expected CUDA Toolkit $env:cudaMajorMinor. nvcc output: $version" + } + $version + build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) - needs: prepare-host-rg + needs: [prepare-host-rg, prepare-host-cuda] runs-on: ${{ matrix.os }} env: @@ -59,6 +145,9 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install + cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaContainerRoot: C:\cuda\v13.2 + cudaContainerRootCMake: C:/cuda/v13.2 strategy: fail-fast: false @@ -183,6 +272,28 @@ jobs: $rgDir | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append & $rgExe --version + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaHostRoot }} + key: cuda-toolkit-13.2.1-windows-x64-v1 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' + $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' + $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." + } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { + throw "Expected CUDA Toolkit 13.2. 
nvcc output: $version" + } + $version + - name: Pull Image run: | docker pull "${{ env.image }}:${{ matrix.tag }}" @@ -199,8 +310,11 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --env "CUDA_PATH=${{ env.cudaContainerRoot }}" ` + --env "CUDA_PATH_V13_2=${{ env.cudaContainerRoot }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${{ env.cudaHostRoot }}:${{ env.cudaContainerRoot }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -222,6 +336,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.cudaContainerRootCMake }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -623,6 +738,8 @@ jobs: name: Nabla / Smoke (${{ matrix.os }}, ${{ matrix.vendor }}-latest, ${{ matrix.config }}) needs: build-windows runs-on: windows-2022 + env: + cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 strategy: fail-fast: false matrix: @@ -636,6 +753,7 @@ jobs: fetch-depth: 1 sparse-checkout: | smoke + src/nbl/ext/CUDAInterop/smoke - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 @@ -646,6 +764,28 @@ jobs: install_lavapipe: true github_token: ${{ github.token }} + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaHostRoot }} + key: cuda-toolkit-13.2.1-windows-x64-v1 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' + $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' + $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or 
-not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." + } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { + throw "Expected CUDA Toolkit 13.2. nvcc output: $version" + } + $version + - name: Download Nabla install artifact uses: actions/download-artifact@v8 with: @@ -668,3 +808,15 @@ jobs: - name: Smoke Flow BUILD_ONLY run: cmake -D FLOW=BUILD_ONLY -D CONFIG=${{ matrix.config }} -P smoke/RunSmokeFlow.cmake + + - name: Build CUDA interop package smoke + shell: pwsh + run: | + cmake ` + -S src/nbl/ext/CUDAInterop/smoke ` + -B smoke/cuda-interop-smoke ` + -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` + -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ env.cudaHostRoot }}" + + cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} diff --git a/CMakePresets.json b/CMakePresets.json index 3c11567f46..2c25d06953 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,7 +14,7 @@ "NBL_EMBED_BUILTIN_RESOURCES": "ON", "NBL_NSC_MODE": "SOURCE", "NBL_UPDATE_GIT_SUBMODULE": "OFF", - "NBL_COMPILE_WITH_CUDA": "OFF", + "NBL_COMPILE_WITH_CUDA": "ON", "NBL_BUILD_OPTIX": "OFF", "NBL_BUILD_MITSUBA_LOADER": "ON", "NBL_BUILD_RADEON_RAYS": "OFF", From 0243ed07664ffb222d7e628e0414891dd75dc2c6 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 10:31:57 +0200 Subject: [PATCH 43/51] Fix CUDA cache path in CI --- .github/workflows/build-nabla.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index dd782ec389..865fe7a0fc 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,7 +55,7 @@ jobs: cudaVersion: '13.2.1' cudaMajorMinor: '13.2' cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 - cudaCacheRoot: 
${{ runner.tool_cache }}\cuda\v13.2 + cudaCacheRoot: C:\nabla-ci\cuda\v13.2 steps: - name: Restore CUDA Toolkit @@ -145,7 +145,7 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install - cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaHostRoot: C:\nabla-ci\cuda\v13.2 cudaContainerRoot: C:\cuda\v13.2 cudaContainerRootCMake: C:/cuda/v13.2 @@ -739,7 +739,7 @@ jobs: needs: build-windows runs-on: windows-2022 env: - cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaHostRoot: C:\nabla-ci\cuda\v13.2 strategy: fail-fast: false matrix: From 4ea20f7c7caca2dc75f8ba90d8f636eb31d8285b Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 11:26:32 +0200 Subject: [PATCH 44/51] Seed CUDA cache on Windows 2025 --- .github/workflows/build-nabla.yml | 62 ++++++++++++++++--------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 865fe7a0fc..aae2173fb8 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -49,7 +49,7 @@ jobs: prepare-host-cuda: name: Prepare host CUDA 13.2 - runs-on: windows-2022 + runs-on: windows-2025 env: cudaVersion: '13.2.1' @@ -63,7 +63,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-x64-v1 + key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-v1 - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' @@ -75,7 +75,9 @@ jobs: $nvcc = Join-Path $Root 'bin\nvcc.exe' $cudaH = Join-Path $Root 'include\cuda.h' $nvrtcH = Join-Path $Root 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + $fp16H = Join-Path $Root 'include\cuda_fp16.h' + $vectorTypesH = Join-Path $Root 'include\vector_types.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not 
(Test-Path $vectorTypesH)) { return $false } @@ -88,35 +90,33 @@ jobs: return } - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - if (-not (Get-Command winget -ErrorAction SilentlyContinue)) { - throw "winget is required to install CUDA Toolkit $env:cudaVersion on the host runner." - } - - winget install ` - --exact ` - --id Nvidia.CUDA ` - --version $env:cudaVersion ` - --source winget ` - --accept-package-agreements ` - --accept-source-agreements ` - --disable-interactivity - - if ($LASTEXITCODE -ne 0) { - throw "CUDA Toolkit $env:cudaVersion installation failed." - } + winget source update + winget install ` + --exact ` + --id Nvidia.CUDA ` + --version $env:cudaVersion ` + --source winget ` + --location $env:cudaCacheRoot ` + --accept-package-agreements ` + --accept-source-agreements ` + --disable-interactivity + + if ($LASTEXITCODE -ne 0) { + throw "CUDA Toolkit $env:cudaVersion installation failed." } - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + if (-not (Test-CudaToolkit $env:cudaCacheRoot) -and (Test-CudaToolkit $env:cudaInstallRoot)) { + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + } + $global:LASTEXITCODE = 0 } - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
} - $global:LASTEXITCODE = 0 - name: Verify CUDA Toolkit shell: pwsh @@ -124,7 +124,9 @@ jobs: $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + $fp16H = Join-Path $env:cudaCacheRoot 'include\cuda_fp16.h' + $vectorTypesH = Join-Path $env:cudaCacheRoot 'include\vector_types.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." } $version = & $nvcc --version @@ -277,7 +279,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-v1 - name: Verify CUDA Toolkit shell: pwsh @@ -769,7 +771,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-v1 - name: Verify CUDA Toolkit shell: pwsh From 85fbf7f9bc36e4fd15751344c1427a6176421243 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 12:11:12 +0200 Subject: [PATCH 45/51] Use Choco for CUDA cache seed --- .github/workflows/build-nabla.yml | 34 ++++++++++++++----------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index aae2173fb8..d8efabd3a4 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -63,7 +63,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-v1 + key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-choco-v1 - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' @@ -90,29 +90,25 @@ jobs: return } - 
winget source update - winget install ` - --exact ` - --id Nvidia.CUDA ` + choco install cuda ` --version $env:cudaVersion ` - --source winget ` - --location $env:cudaCacheRoot ` - --accept-package-agreements ` - --accept-source-agreements ` - --disable-interactivity + --yes ` + --no-progress if ($LASTEXITCODE -ne 0) { throw "CUDA Toolkit $env:cudaVersion installation failed." } - if (-not (Test-CudaToolkit $env:cudaCacheRoot) -and (Test-CudaToolkit $env:cudaInstallRoot)) { - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" - } - $global:LASTEXITCODE = 0 + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + } + + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" } + $global:LASTEXITCODE = 0 if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
@@ -279,7 +275,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 - name: Verify CUDA Toolkit shell: pwsh @@ -771,7 +767,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 - name: Verify CUDA Toolkit shell: pwsh From 6008285457ae6f1a6bd632a4755f8e4ddc33802b Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 13:50:47 +0200 Subject: [PATCH 46/51] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index b4a8725d54..10022c5de1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 +Subproject commit 10022c5de1b8350b8a4c85c35871bcd84e4877a7 From 82d82a26138f43591f2c3ea3af2b7a287169181a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 15:20:59 +0200 Subject: [PATCH 47/51] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 10022c5de1..39d02e2602 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 10022c5de1b8350b8a4c85c35871bcd84e4877a7 +Subproject commit 39d02e26023c72a7d3241e5df85e9b7c4afacb84 From 828211c1cbe23b5662877ae446162a26108eb21c Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 15:38:01 +0200 Subject: [PATCH 48/51] Retry CI image pull --- .github/workflows/build-nabla.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d8efabd3a4..7e89d68e15 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -294,7 +294,16 @@ 
jobs: - name: Pull Image run: | - docker pull "${{ env.image }}:${{ matrix.tag }}" + $image = "${{ env.image }}:${{ matrix.tag }}" + for ($attempt = 1; $attempt -le 3; $attempt++) { + docker pull $image + if ($LASTEXITCODE -eq 0) { + exit 0 + } + Write-Warning "docker pull failed for $image on attempt $attempt." + Start-Sleep -Seconds (15 * $attempt) + } + exit $LASTEXITCODE - name: Run Container run: | From e913518df6105ef947101c4320bff066ea679a45 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 07:16:25 +0200 Subject: [PATCH 49/51] Deduplicate CUDA CI setup --- .github/scripts/ci_cuda_toolkit.py | 155 +++++++++++++++++++++++++++++ .github/workflows/build-nabla.yml | 149 ++++++++------------------- 2 files changed, 194 insertions(+), 110 deletions(-) create mode 100644 .github/scripts/ci_cuda_toolkit.py diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py new file mode 100644 index 0000000000..ee76eaf0b5 --- /dev/null +++ b/.github/scripts/ci_cuda_toolkit.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +import argparse +import os +import platform +import subprocess +from pathlib import Path + + +REQUIRED_HEADERS = ( + "cuda.h", + "nvrtc.h", + "cuda_fp16.h", + "vector_types.h", +) + + +def cuda_version() -> str: + version = os.environ.get("CUDA_VERSION", "").strip() + if not version: + raise SystemExit("CUDA_VERSION is not set.") + parts = version.split(".") + if len(parts) < 2 or not all(part.isdigit() for part in parts[:2]): + raise SystemExit(f"CUDA_VERSION must start with major.minor, got: {version}") + return version + + +def major_minor(version: str) -> str: + major, minor, *_ = version.split(".") + return f"{major}.{minor}" + + +def windows_paths(version: str) -> dict[str, str]: + mm = major_minor(version) + major, minor = mm.split(".") + return { + "cache_root": rf"C:\nabla-ci\cuda\v{mm}", + "container_root": rf"C:\cuda\v{mm}", + "container_root_cmake": f"C:/cuda/v{mm}", + "version_env": 
f"CUDA_PATH_V{major}_{minor}", + "cache_key": f"cuda-toolkit-{version}-windows-2025-x64-choco-v1", + } + + +def windows_install_root(version: str) -> Path: + return Path(rf"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v{major_minor(version)}") + + +def paths() -> dict[str, str]: + version = cuda_version() + if platform.system() == "Windows": + return windows_paths(version) + mm = major_minor(version) + return { + "cache_root": f"/opt/nabla-ci/cuda/v{mm}", + "container_root": f"/cuda/v{mm}", + "container_root_cmake": f"/cuda/v{mm}", + "version_env": f"CUDA_PATH_V{mm.replace('.', '_')}", + "cache_key": f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1", + } + + +def emit_outputs() -> None: + output = os.environ.get("GITHUB_OUTPUT") + values = paths() + lines = [f"{key}={value}" for key, value in values.items()] + if output: + with open(output, "a", encoding="utf-8") as file: + file.write("\n".join(lines)) + file.write("\n") + else: + print("\n".join(lines)) + + +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + +def nvcc_path(root: Path) -> Path: + executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" + return root / "bin" / executable + + +def verify_toolkit(root: Path, version: str) -> bool: + missing = [str(nvcc_path(root))] + missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) + missing = [path for path in missing if not Path(path).exists()] + if missing: + print(f"CUDA Toolkit is incomplete at {root}.") + for path in missing: + print(f"missing: {path}") + return False + + result = run([str(nvcc_path(root)), "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + print(result.stdout) + expected = f"release {major_minor(version)}" + if result.returncode != 0 or expected not in result.stdout: + print(f"Expected CUDA Toolkit {major_minor(version)} at {root}.") + return False + 
return True + + +def verify() -> None: + version = cuda_version() + values = paths() + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", values["cache_root"])) + if not verify_toolkit(root, version): + raise SystemExit(1) + + +def install() -> None: + if platform.system() != "Windows": + raise SystemExit("CUDA Toolkit install is only implemented for Windows CI.") + + version = cuda_version() + values = paths() + install_root = windows_install_root(version) + cache_root = Path(values["cache_root"]) + + if verify_toolkit(cache_root, version): + print(f"CUDA Toolkit {major_minor(version)} already restored at {cache_root}") + return + + result = run(["choco", "install", "cuda", "--version", version, "--yes", "--no-progress"]) + if result.returncode != 0: + raise SystemExit(f"CUDA Toolkit {version} installation failed.") + + if not verify_toolkit(install_root, version): + raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {install_root} after installation.") + + cache_root.mkdir(parents=True, exist_ok=True) + result = run(["robocopy", str(install_root), str(cache_root), "/MIR", "/R:2", "/W:2", "/NFL", "/NDL", "/NP"]) + if result.returncode > 7: + raise SystemExit(f"Failed to mirror CUDA Toolkit into cache root. 
robocopy exit code: {result.returncode}") + + if not verify_toolkit(cache_root, version): + raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {cache_root} after installation.") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("command", choices=("outputs", "install", "verify")) + args = parser.parse_args() + + if args.command == "outputs": + emit_outputs() + elif args.command == "install": + install() + elif args.command == "verify": + verify() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 7e89d68e15..e151c291e0 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -9,6 +9,9 @@ permissions: contents: read actions: read +env: + CUDA_VERSION: '13.2.1' + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -48,88 +51,34 @@ jobs: & $rgExe --version prepare-host-cuda: - name: Prepare host CUDA 13.2 + name: Prepare host CUDA runs-on: windows-2025 - env: - cudaVersion: '13.2.1' - cudaMajorMinor: '13.2' - cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 - cudaCacheRoot: C:\nabla-ci\cuda\v13.2 - steps: + - name: Checkout CUDA CI helper + uses: actions/checkout@v6 + with: + fetch-depth: 1 + sparse-checkout: | + .github/scripts + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore CUDA Toolkit id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' - shell: pwsh - run: | - function Test-CudaToolkit { - param([string]$Root) - - $nvcc = Join-Path $Root 'bin\nvcc.exe' - $cudaH = Join-Path $Root 'include\cuda.h' - $nvrtcH = Join-Path $Root 
'include\nvrtc.h' - $fp16H = Join-Path $Root 'include\cuda_fp16.h' - $vectorTypesH = Join-Path $Root 'include\vector_types.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { - return $false - } - - $version = & $nvcc --version 2>&1 - return ($LASTEXITCODE -eq 0 -and ($version -match "release $env:cudaMajorMinor")) - } - - if (Test-CudaToolkit $env:cudaCacheRoot) { - Write-Host "CUDA Toolkit $env:cudaMajorMinor already restored at $env:cudaCacheRoot" - return - } - - choco install cuda ` - --version $env:cudaVersion ` - --yes ` - --no-progress - - if ($LASTEXITCODE -ne 0) { - throw "CUDA Toolkit $env:cudaVersion installation failed." - } - - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." - } - - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" - } - $global:LASTEXITCODE = 0 - - if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
- } + run: python .github/scripts/ci_cuda_toolkit.py install - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' - $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' - $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' - $fp16H = Join-Path $env:cudaCacheRoot 'include\cuda_fp16.h' - $vectorTypesH = Join-Path $env:cudaCacheRoot 'include\vector_types.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { - throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match "release $env:cudaMajorMinor")) { - throw "Expected CUDA Toolkit $env:cudaMajorMinor. nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) @@ -143,9 +92,6 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install - cudaHostRoot: C:\nabla-ci\cuda\v13.2 - cudaContainerRoot: C:\cuda\v13.2 - cudaContainerRootCMake: C:/cuda/v13.2 strategy: fail-fast: false @@ -252,6 +198,10 @@ jobs: with: submodules: 'recursive' + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore ripgrep host tool id: cache-rg uses: actions/cache@v5 @@ -274,23 +224,11 @@ jobs: id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' - $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' - $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' - if (-not (Test-Path $nvcc) 
-or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { - throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { - throw "Expected CUDA Toolkit 13.2. nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify - name: Pull Image run: | @@ -317,11 +255,11 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` - --env "CUDA_PATH=${{ env.cudaContainerRoot }}" ` - --env "CUDA_PATH_V13_2=${{ env.cudaContainerRoot }}" ` + --env "CUDA_PATH=${{ steps.cuda.outputs.container_root }}" ` + --env "${{ steps.cuda.outputs.version_env }}=${{ steps.cuda.outputs.container_root }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` - -v "${{ env.cudaHostRoot }}:${{ env.cudaContainerRoot }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ steps.cuda.outputs.container_root }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -343,7 +281,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` - -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.cudaContainerRootCMake }} ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ steps.cuda.outputs.container_root_cmake }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -745,8 +683,6 @@ jobs: name: Nabla / Smoke (${{ matrix.os }}, ${{ matrix.vendor }}-latest, ${{ matrix.config }}) needs: build-windows runs-on: windows-2022 - env: - cudaHostRoot: C:\nabla-ci\cuda\v13.2 strategy: fail-fast: false matrix: @@ -759,9 +695,14 @@ jobs: with: fetch-depth: 1 sparse-checkout: | + .github/scripts smoke src/nbl/ext/CUDAInterop/smoke 
+ - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 with: @@ -775,23 +716,11 @@ jobs: id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' - $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' - $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { - throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { - throw "Expected CUDA Toolkit 13.2. 
nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify - name: Download Nabla install artifact uses: actions/download-artifact@v8 @@ -824,6 +753,6 @@ jobs: -B smoke/cuda-interop-smoke ` -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` - -D "Nabla_CUDA_TOOLKIT_ROOT=${{ env.cudaHostRoot }}" + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ steps.cuda.outputs.cache_root }}" cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} From f74efe822c9681eed2a474254a85af6722000660 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 07:46:21 +0200 Subject: [PATCH 50/51] Simplify CUDA CI cache handling --- .github/scripts/ci_cuda_toolkit.py | 89 +++++++++--------------------- .github/workflows/build-nabla.yml | 15 +++-- 2 files changed, 32 insertions(+), 72 deletions(-) diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py index ee76eaf0b5..04bc1efc91 100644 --- a/.github/scripts/ci_cuda_toolkit.py +++ b/.github/scripts/ci_cuda_toolkit.py @@ -30,40 +30,33 @@ def major_minor(version: str) -> str: return f"{major}.{minor}" -def windows_paths(version: str) -> dict[str, str]: +def cache_root(version: str) -> str: mm = major_minor(version) - major, minor = mm.split(".") - return { - "cache_root": rf"C:\nabla-ci\cuda\v{mm}", - "container_root": rf"C:\cuda\v{mm}", - "container_root_cmake": f"C:/cuda/v{mm}", - "version_env": f"CUDA_PATH_V{major}_{minor}", - "cache_key": f"cuda-toolkit-{version}-windows-2025-x64-choco-v1", - } + if platform.system() == "Windows": + return rf"C:\nabla-ci\cuda\v{mm}" + return f"/opt/nabla-ci/cuda/v{mm}" -def windows_install_root(version: str) -> Path: - return Path(rf"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v{major_minor(version)}") +def cache_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-v2" + return 
f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1" -def paths() -> dict[str, str]: - version = cuda_version() +def cache_restore_key(version: str) -> str: if platform.system() == "Windows": - return windows_paths(version) - mm = major_minor(version) - return { - "cache_root": f"/opt/nabla-ci/cuda/v{mm}", - "container_root": f"/cuda/v{mm}", - "container_root_cmake": f"/cuda/v{mm}", - "version_env": f"CUDA_PATH_V{mm.replace('.', '_')}", - "cache_key": f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1", - } + return f"cuda-toolkit-{version}-windows-2025-x64-" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-" def emit_outputs() -> None: + version = cuda_version() + lines = ( + f"cache_root={cache_root(version)}", + f"cache_key={cache_key(version)}", + f"cache_restore_key={cache_restore_key(version)}", + ) output = os.environ.get("GITHUB_OUTPUT") - values = paths() - lines = [f"{key}={value}" for key, value in values.items()] if output: with open(output, "a", encoding="utf-8") as file: file.write("\n".join(lines)) @@ -72,22 +65,22 @@ def emit_outputs() -> None: print("\n".join(lines)) -def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: - print("+", " ".join(command)) - return subprocess.run(command, check=False, text=True, **kwargs) - - def nvcc_path(root: Path) -> Path: executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" return root / "bin" / executable +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + def verify_toolkit(root: Path, version: str) -> bool: missing = [str(nvcc_path(root))] missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) missing = [path for path in missing if not Path(path).exists()] if missing: - print(f"CUDA Toolkit is incomplete at {root}.") + print(f"CUDA Toolkit cache is incomplete at {root}.") for path in missing: 
print(f"missing: {path}") return False @@ -103,50 +96,18 @@ def verify_toolkit(root: Path, version: str) -> bool: def verify() -> None: version = cuda_version() - values = paths() - root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", values["cache_root"])) + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", cache_root(version))) if not verify_toolkit(root, version): raise SystemExit(1) -def install() -> None: - if platform.system() != "Windows": - raise SystemExit("CUDA Toolkit install is only implemented for Windows CI.") - - version = cuda_version() - values = paths() - install_root = windows_install_root(version) - cache_root = Path(values["cache_root"]) - - if verify_toolkit(cache_root, version): - print(f"CUDA Toolkit {major_minor(version)} already restored at {cache_root}") - return - - result = run(["choco", "install", "cuda", "--version", version, "--yes", "--no-progress"]) - if result.returncode != 0: - raise SystemExit(f"CUDA Toolkit {version} installation failed.") - - if not verify_toolkit(install_root, version): - raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {install_root} after installation.") - - cache_root.mkdir(parents=True, exist_ok=True) - result = run(["robocopy", str(install_root), str(cache_root), "/MIR", "/R:2", "/W:2", "/NFL", "/NDL", "/NP"]) - if result.returncode > 7: - raise SystemExit(f"Failed to mirror CUDA Toolkit into cache root. 
robocopy exit code: {result.returncode}") - - if not verify_toolkit(cache_root, version): - raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {cache_root} after installation.") - - def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument("command", choices=("outputs", "install", "verify")) + parser.add_argument("command", choices=("outputs", "verify")) args = parser.parse_args() if args.command == "outputs": emit_outputs() - elif args.command == "install": - install() elif args.command == "verify": verify() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e151c291e0..e61debb623 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -11,6 +11,7 @@ permissions: env: CUDA_VERSION: '13.2.1' + CUDA_CONTAINER_ROOT: C:\cuda concurrency: group: push-lock-${{ github.ref }} @@ -72,10 +73,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} - - - name: Install CUDA Toolkit - if: steps.cache-cuda.outputs.cache-hit != 'true' - run: python .github/scripts/ci_cuda_toolkit.py install + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify @@ -226,6 +224,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify @@ -255,11 +254,10 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` - --env "CUDA_PATH=${{ steps.cuda.outputs.container_root }}" ` - --env "${{ steps.cuda.outputs.version_env }}=${{ steps.cuda.outputs.container_root }}" ` + --env "CUDA_PATH=${{ env.CUDA_CONTAINER_ROOT }}" ` --name orphan --network docker_default ` -v "${{ github.workspace 
}}:${{ env.mount }}" ` - -v "${{ steps.cuda.outputs.cache_root }}:${{ steps.cuda.outputs.container_root }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ env.CUDA_CONTAINER_ROOT }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -281,7 +279,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` - -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ steps.cuda.outputs.container_root_cmake }} ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.CUDA_CONTAINER_ROOT }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -718,6 +716,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify From 920f2ef7b8f07a5d9dcf500e500ccd09500b9987 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 08:16:28 +0200 Subject: [PATCH 51/51] Keep CUDA CI paths configurable --- .github/scripts/ci_cuda_toolkit.py | 12 ++++++++---- .github/workflows/build-nabla.yml | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py index 04bc1efc91..975dc7101d 100644 --- a/.github/scripts/ci_cuda_toolkit.py +++ b/.github/scripts/ci_cuda_toolkit.py @@ -30,11 +30,15 @@ def major_minor(version: str) -> str: return f"{major}.{minor}" +def cache_base() -> Path: + base = os.environ.get("CUDA_CACHE_BASE", "").strip() + if not base: + raise SystemExit("CUDA_CACHE_BASE is not set.") + return Path(base) + + def cache_root(version: str) -> str: - mm = major_minor(version) - if platform.system() == "Windows": - return rf"C:\nabla-ci\cuda\v{mm}" - return f"/opt/nabla-ci/cuda/v{mm}" + return 
str(cache_base() / f"v{major_minor(version)}") def cache_key(version: str) -> str: diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e61debb623..d6d593ebc5 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -11,7 +11,8 @@ permissions: env: CUDA_VERSION: '13.2.1' - CUDA_CONTAINER_ROOT: C:\cuda + CUDA_CACHE_BASE: 'C:\nabla-ci\cuda' + CUDA_CONTAINER_ROOT: 'C:\cuda' concurrency: group: push-lock-${{ github.ref }}