Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f22f11c
Initial implementation of CUDA interop unit test
Mar 3, 2026
b8abd20
Dummy
Mar 23, 2026
93ca5ef
Refactor test into separate section
kevyuu Apr 22, 2026
03d2ce2
Update to follow latest commit on main repo
kevyuu Apr 23, 2026
1e120e8
Fix ex 67 due to changes in memory allocation
kevyuu Apr 25, 2026
fc00a68
ASSERT_SUCCESS into ASSERT_CUDA_SUCCESS
kevyuu Apr 30, 2026
0057225
Refactor ASSERT_CUDA_SUCCESS
kevyuu May 4, 2026
82d0592
Slight naming refactor
kevyuu May 4, 2026
a229db2
Remove unused commented code
kevyuu May 4, 2026
feac63d
Build CUDA interop example through extension target
AnastaZIuk May 6, 2026
6f136a2
Simplify CUDA interop example link
AnastaZIuk May 6, 2026
b17beb2
Use CUDA interop native target
AnastaZIuk May 6, 2026
fd50fda
Use native CUDA accessors
AnastaZIuk May 6, 2026
24525f0
Use CUDA interop target
AnastaZIuk May 6, 2026
4671898
Use CUDA native interop helper
AnastaZIuk May 7, 2026
acdcfc8
Use CUDA interop helper in example
AnastaZIuk May 7, 2026
d5aa23b
Use CUDA interop accessors
AnastaZIuk May 7, 2026
5031a24
Use explicit CUDA compile log
AnastaZIuk May 7, 2026
7b5817a
Fix CUDA interop example assert helper
AnastaZIuk May 7, 2026
2d415af
Use opaque CUDA interop handles
AnastaZIuk May 8, 2026
e289ee1
Use opaque CUDA interop calls
AnastaZIuk May 9, 2026
b4601fc
Use native CUDA interop conversion
AnastaZIuk May 9, 2026
d373d31
Fix CUDA interop smoke validation
AnastaZIuk May 10, 2026
a6268bc
Use CUDA interop assert helper
AnastaZIuk May 10, 2026
eb8f44a
Use native CUDA interop handles in EX76
AnastaZIuk May 10, 2026
3944176
Pass CUDA handler pointer to assert macro
AnastaZIuk May 10, 2026
b4a8725
Polish CUDA interop example usage
AnastaZIuk May 11, 2026
10022c5
Merge master into CUDA interop examples
AnastaZIuk May 11, 2026
39d02e2
Fix path tracer allocation size access
AnastaZIuk May 11, 2026
951bc99
Initial implementation of testWmmaGemm test
kevyuu May 12, 2026
8e84dcd
Remove test for WmmaGemm half
kevyuu May 12, 2026
fcc4a49
Merge branch 'master' into vk_cuda_interop
kevyuu May 12, 2026
854ced6
Merge vk_cuda_interop into CUDA interop examples
AnastaZIuk May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions 40_PathTracer/src/renderer/CRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ core::smart_refctd_ptr<CScene> CRenderer::createScene(CScene::SCreationParams&&
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({.offset=0,.length=info.size});
retval.memory->map({.offset=0,.length=info.allocationSize});
return retval;
}

Expand Down Expand Up @@ -896,4 +896,4 @@ IQueue::SSubmitInfo::SSemaphoreInfo CRenderer::SSubmit::operator()(std::span<con
return rendered[0];
}

}
}
2 changes: 1 addition & 1 deletion 67_RayQueryGeometry/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({.offset=0,.length=info.size});
retval.memory->map({.offset=0,.length=info.allocationSize});
return retval;
}

Expand Down
2 changes: 1 addition & 1 deletion 71_RayTracingPipeline/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1260,7 +1260,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({ .offset = 0,.length = info.size });
retval.memory->map({ .offset = 0,.length = info.allocationSize });
return retval;
}

Expand Down
26 changes: 26 additions & 0 deletions 76_CudaInterop/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Build script for the 76_CudaInterop example.
# Pulls in the repo-wide example helpers from {repo_root}/cmake/common.cmake.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the example executable target (sets EXECUTABLE_NAME); the empty
# arguments presumably stand for optional extra sources/libs/defines — confirm
# against nbl_create_executable_project's signature in the main repo.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Link the engine's native CUDA interop target into this example.
nbl_target_link_cuda_interop(${EXECUTABLE_NAME} PRIVATE)

# Optionally embed everything under app_resources/ (the .cu kernel sources)
# into the binary as builtin resources so no filesystem access is needed at runtime.
if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Resolve absolute paths: where to search for resources and where the
# generated embedding sources/headers are written.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Register every file under app_resources/ (recursively) for embedding.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Generate the builtin-resource data target under the nbl::this_example::builtin namespace.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Make the embedded resources available to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
40 changes: 40 additions & 0 deletions 76_CudaInterop/app_resources/vectorAdd_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in
// [0, numElements). Launch with at least numElements threads in total;
// surplus threads fall through the bounds check and do nothing.
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
                                     size_t numElements) {
  // Compute the global thread index as size_t. The original `int i` caused a
  // signed/unsigned comparison against numElements and would overflow for
  // grids addressing more than INT_MAX elements; widening the multiplication
  // before it happens keeps the index exact for large launches.
  size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements)
    C[i] = A[i] + B[i];
}
53 changes: 53 additions & 0 deletions 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include <mma.h>
#include <cuda_runtime.h>

using namespace nvcuda;

// Define WMMA parameters: the tile shape handled per warp-level MMA.
// With 1-bit (experimental precision::b1) operands, K counts individual bits,
// which are packed 32 per int word — hence the `/ 32` indexing below.
const int WMMA_M = 8;
const int WMMA_N = 8;
const int WMMA_K = 128;

// Binary (1-bit) WMMA GEMM: accumulates bitwise-op + popcount products of the
// bit-packed matrices a (row-major) and b (col-major) into the int matrix c.
// Each warp produces one WMMA_M x WMMA_N output tile.
// NOTE(review): there are no bounds checks — the launch configuration must
// guarantee every warp's tile lies inside M x N and that K is a multiple of
// WMMA_K; confirm against the host-side dispatch.
extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c,
int M, int N, int K) {
// Leading dimensions, in bits for a/b (divided by 32 when indexing int words)
// and in elements for c.
int lda = K;
int ldb = K;
int ldc = N;

// Tile indices: one warp (32 threads) per tile along M, one thread-y per tile along N.
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

// Fragments: a is row-major, b is col-major, accumulator is 32-bit int.
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;

// Initialize accumulator with zeros
wmma::fill_fragment(acc_frag, 0);

// Loop over the K-dimension, one WMMA_K-bit slab at a time.
for (int i = 0; i < K; i += WMMA_K) {
int aRow = warpM * WMMA_M;
int aCol = i / 32; // Indexing uint32_t words (32 packed bits per int)

int bRow = i / 32;
int bCol = warpN * WMMA_N;

// Load fragments
// Note: load_matrix_sync handles the bit-packing layout internally
wmma::load_matrix_sync(a_frag, a + (aRow * lda / 32 + aCol), lda);
wmma::load_matrix_sync(b_frag, b + (bCol * ldb / 32 + bRow), ldb);

// Perform AND-Popcount MMA (bmmaBitOpAND).
// NOTE(review): an earlier comment here said "XOR-Popcount", which
// contradicts the bmmaBitOpAND operand — confirm which bit op is intended.
wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND);
}

// Store the result tile (row-major) at its M/N offset in c.
int cRow = warpM * WMMA_M;
int cCol = warpN * WMMA_N;
int* outputLoc = c + (cRow * ldc + cCol);
wmma::store_matrix_sync(outputLoc, acc_frag, ldc, wmma::mem_row_major);

}
Loading