Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f22f11c
Initial implementation of CUDA interop unit test
Mar 3, 2026
b8abd20
Dummy
Mar 23, 2026
93ca5ef
Refactor test into separate section
kevyuu Apr 22, 2026
03d2ce2
Update to follow latest commit on main repo
kevyuu Apr 23, 2026
1e120e8
Fix ex 67 due to changes in memory allocation
kevyuu Apr 25, 2026
fc00a68
ASSERT_SUCCESS into ASSERT_CUDA_SUCCESS
kevyuu Apr 30, 2026
0057225
Refactor ASSERT_CUDA_SUCCESS
kevyuu May 4, 2026
82d0592
Slight naming refactor
kevyuu May 4, 2026
a229db2
Remove unused commented code
kevyuu May 4, 2026
feac63d
Build CUDA interop example through extension target
AnastaZIuk May 6, 2026
6f136a2
Simplify CUDA interop example link
AnastaZIuk May 6, 2026
b17beb2
Use CUDA interop native target
AnastaZIuk May 6, 2026
fd50fda
Use native CUDA accessors
AnastaZIuk May 6, 2026
24525f0
Use CUDA interop target
AnastaZIuk May 6, 2026
4671898
Use CUDA native interop helper
AnastaZIuk May 7, 2026
acdcfc8
Use CUDA interop helper in example
AnastaZIuk May 7, 2026
d5aa23b
Use CUDA interop accessors
AnastaZIuk May 7, 2026
5031a24
Use explicit CUDA compile log
AnastaZIuk May 7, 2026
7b5817a
Fix CUDA interop example assert helper
AnastaZIuk May 7, 2026
2d415af
Use opaque CUDA interop handles
AnastaZIuk May 8, 2026
e289ee1
Use opaque CUDA interop calls
AnastaZIuk May 9, 2026
b4601fc
Use native CUDA interop conversion
AnastaZIuk May 9, 2026
d373d31
Fix CUDA interop smoke validation
AnastaZIuk May 10, 2026
a6268bc
Use CUDA interop assert helper
AnastaZIuk May 10, 2026
eb8f44a
Use native CUDA interop handles in EX76
AnastaZIuk May 10, 2026
3944176
Pass CUDA handler pointer to assert macro
AnastaZIuk May 10, 2026
b4a8725
Polish CUDA interop example usage
AnastaZIuk May 11, 2026
10022c5
Merge master into CUDA interop examples
AnastaZIuk May 11, 2026
39d02e2
Fix path tracer allocation size access
AnastaZIuk May 11, 2026
951bc99
Initial implementation of testWmmaGemm test
kevyuu May 12, 2026
8e84dcd
Remove test for WmmaGemm half
kevyuu May 12, 2026
fcc4a49
Merge branch 'master' into vk_cuda_interop
kevyuu May 12, 2026
854ced6
Merge vk_cuda_interop into CUDA interop examples
AnastaZIuk May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions 40_PathTracer/src/renderer/CRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ core::smart_refctd_ptr<CScene> CRenderer::createScene(CScene::SCreationParams&&
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({.offset=0,.length=info.size});
retval.memory->map({.offset=0,.length=info.allocationSize});
return retval;
}

Expand Down Expand Up @@ -896,4 +896,4 @@ IQueue::SSubmitInfo::SSemaphoreInfo CRenderer::SSubmit::operator()(std::span<con
return rendered[0];
}

}
}
2 changes: 1 addition & 1 deletion 67_RayQueryGeometry/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({.offset=0,.length=info.size});
retval.memory->map({.offset=0,.length=info.allocationSize});
return retval;
}

Expand Down
2 changes: 1 addition & 1 deletion 71_RayTracingPipeline/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1260,7 +1260,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
auto retval = device->allocate(info);
// map what is mappable by default so ReBAR checks succeed
if (retval.isValid() && retval.memory->isMappable())
retval.memory->map({ .offset = 0,.length = info.size });
retval.memory->map({ .offset = 0,.length = info.allocationSize });
return retval;
}

Expand Down
26 changes: 26 additions & 0 deletions 76_CudaInterop/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Build script for the 76_CudaInterop example.
# Pulls in the repo-wide example helpers from {repo_root}/cmake/common.cmake.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the example executable target (sets EXECUTABLE_NAME); the empty
# arguments presumably stand for optional extra sources/libs/defines — confirm
# against nbl_create_executable_project's signature in the main repo.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Link the engine's native CUDA interop target into this example.
nbl_target_link_cuda_interop(${EXECUTABLE_NAME} PRIVATE)

# Optionally embed everything under app_resources/ (the .cu kernel sources)
# into the binary as builtin resources so no filesystem access is needed at runtime.
if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Resolve absolute paths: where to search for resources and where the
# generated embedding sources/headers are written.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Register every file under app_resources/ (recursively) for embedding.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Generate the builtin-resource data target under the nbl::this_example::builtin namespace.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Make the embedded resources available to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
40 changes: 40 additions & 0 deletions 76_CudaInterop/app_resources/vectorAdd_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in
// [0, numElements). Launch with at least numElements threads in total;
// surplus threads fall through the bounds check and do nothing.
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
                                     size_t numElements) {
  // Compute the global thread index as size_t. The original `int i` caused a
  // signed/unsigned comparison against numElements and would overflow for
  // grids addressing more than INT_MAX elements; widening the multiplication
  // before it happens keeps the index exact for large launches.
  size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
  if (i < numElements)
    C[i] = A[i] + B[i];
}
53 changes: 53 additions & 0 deletions 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include <mma.h>
#include <cuda_runtime.h>

using namespace nvcuda;

// Define WMMA parameters: the tile shape handled per warp-level MMA.
// With 1-bit (experimental precision::b1) operands, K counts individual bits,
// which are packed 32 per int word — hence the `/ 32` indexing below.
const int WMMA_M = 8;
const int WMMA_N = 8;
const int WMMA_K = 128;

// Binary (1-bit) WMMA GEMM: accumulates bitwise-op + popcount products of the
// bit-packed matrices a (row-major) and b (col-major) into the int matrix c.
// Each warp produces one WMMA_M x WMMA_N output tile.
// NOTE(review): there are no bounds checks — the launch configuration must
// guarantee every warp's tile lies inside M x N and that K is a multiple of
// WMMA_K; confirm against the host-side dispatch.
extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c,
int M, int N, int K) {
// Leading dimensions, in bits for a/b (divided by 32 when indexing int words)
// and in elements for c.
int lda = K;
int ldb = K;
int ldc = N;

// Tile indices: one warp (32 threads) per tile along M, one thread-y per tile along N.
int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
int warpN = (blockIdx.y * blockDim.y + threadIdx.y);

// Fragments: a is row-major, b is col-major, accumulator is 32-bit int.
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;

// Initialize accumulator with zeros
wmma::fill_fragment(acc_frag, 0);

// Loop over the K-dimension, one WMMA_K-bit slab at a time.
for (int i = 0; i < K; i += WMMA_K) {
int aRow = warpM * WMMA_M;
int aCol = i / 32; // Indexing uint32_t words (32 packed bits per int)

int bRow = i / 32;
int bCol = warpN * WMMA_N;

// Load fragments
// Note: load_matrix_sync handles the bit-packing layout internally
wmma::load_matrix_sync(a_frag, a + (aRow * lda / 32 + aCol), lda);
wmma::load_matrix_sync(b_frag, b + (bCol * ldb / 32 + bRow), ldb);

// Perform AND-Popcount MMA (bmmaBitOpAND).
// NOTE(review): an earlier comment here said "XOR-Popcount", which
// contradicts the bmmaBitOpAND operand — confirm which bit op is intended.
wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND);
}

// Store the result tile (row-major) at its M/N offset in c.
int cRow = warpM * WMMA_M;
int cCol = warpN * WMMA_N;
int* outputLoc = c + (cRow * ldc + cCol);
wmma::store_matrix_sync(outputLoc, acc_frag, ldc, wmma::mem_row_major);

}
Loading