From 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2 Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Thu, 16 Apr 2026 14:58:47 +0200 Subject: [PATCH 1/5] make NEE work in ex 31 with Global L solid angle sampling of spherical rect --- .../hlsl/next_event_estimator.hlsl | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index c8bee786c..29aca1824 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -246,6 +246,7 @@ template struct ShapeSampling { using scalar_type = T; + using vector2_type = vector; using vector3_type = vector; static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) @@ -262,48 +263,56 @@ struct ShapeSampling matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - scalar_type solidAngle = sphR0.solidAngle(ray.origin).value; - if (solidAngle > numeric_limits::min) - pdf = 1.f / solidAngle; - else - pdf = bit_cast(numeric_limits::infinity); - return pdf; + + // 1.f/0.f gives infinity no special checks needed + return 1.f / sphR0.solidAngle(ray.origin).value; } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; - matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = 
hlsl::promote(0.0); + // sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0, origin); - if ( ssph.solidAngle > numeric_limits::min) + typename sampling::SphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - typename sampling::SphericalRectangle::cache_type cache; - const vector3_type localDir = ssph.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = ssph.forwardPdf(xi.xy, cache); + // actually the slowest + //L = ssph.generate(xi.xy, cache); + //newRayMaxT = ssph.computeHitT(L); + + // fastest + const vector3_type localL = ssph.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + L = hlsl::mul(hlsl::transpose(ssph.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); + { + L = ssph.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + pdf = ssph.forwardPdf(xi.xy,cache); return L; } @@ -322,7 +331,7 @@ struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; - +#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. 
// Same grazing-angle limitations as the triangle variant -- see comments @@ -398,7 +407,7 @@ struct ShapeSampling Shape rect; }; - +#endif template struct NextEventEstimator From 89ecce14443c216b30ff84b837b899045bb5513f Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Fri, 17 Apr 2026 03:26:32 +0200 Subject: [PATCH 2/5] prep for rendering with PSA rectangle --- .../hlsl/next_event_estimator.hlsl | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 29aca1824..91d2a2d5e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -177,9 +177,7 @@ struct ShapeSampling const vector3_type tri_vertices[3] = {tri.vertex0, tri.vertex1, tri.vertex2}; shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri_vertices, ray.origin); sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - const scalar_type pdf = pst.backwardPdf(L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return pst.backwardWeight(L); } template @@ -331,7 +329,6 @@ struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; -#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. 
// Same grazing-angle limitations as the triangle variant -- see comments @@ -361,21 +358,12 @@ struct ShapeSampling sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, ray.origin, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - // Reconstruct normalized [0,1]^2 position on the rectangle from the ray direction - const vector3_type N = rect.getNormalTimesArea(); - const scalar_type t = hlsl::dot(N, rect.offset - ray.origin) / hlsl::dot(N, ray.direction); - const vector3_type hitPoint = ray.origin + ray.direction * t; - const vector3_type localHit = hitPoint - rect.offset; - const vector p = vector(hlsl::dot(localHit, rectNormalBasis[0]) / rectExtents.x, hlsl::dot(localHit, rectNormalBasis[1]) / rectExtents.y); - const scalar_type pdf = psr.backwardPdf(p); - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return psr.backwardWeight(ray.direction); } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; matrix rectNormalBasis; vector rectExtents; @@ -384,30 +372,40 @@ struct ShapeSampling sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, origin, interaction.getN(), interaction.isMaterialBSDF()); - const scalar_type solidAngle = psr.sphrect.solidAngle; - if (solidAngle > numeric_limits::min) + typename sampling::ProjectedSphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - 
typename sampling::ProjectedSphericalRectangle::cache_type cache; - const vector3_type localDir = psr.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = psr.forwardPdf(xi.xy, cache); + // actually the slowest + //L = psr.generate(xi.xy, cache); + //newRayMaxT = psr.sphrect.computeHitT(L); + + // fastest + const vector3_type localL = psr.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + // hopefully CSE kicks in for the `UsePdfAsWeight==true` + L = hlsl::mul(hlsl::transpose(psr.sphrect.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); - - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + { + L = psr.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; + + pdf = psr.forwardPdf(xi.xy,cache); return L; } Shape rect; }; -#endif template struct NextEventEstimator From fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 22 Apr 2026 01:16:12 +0300 Subject: [PATCH 3/5] jacobian tests, better benchmarks, addressed comments --- 37_HLSLSamplingTests/CMakeLists.txt | 172 ++++++- .../app_resources/common/alias_table.hlsl | 2 + .../app_resources/common/array_accessor.hlsl | 1 - .../app_resources/common/bilinear.hlsl | 6 + .../common/box_muller_transform.hlsl | 3 + .../common/concentric_mapping.hlsl | 12 +- .../common/cumulative_probability.hlsl | 2 + .../common/discrete_sampler_bench.hlsl | 3 - .../app_resources/common/jacobian_test.hlsl | 264 ++++++++++ .../app_resources/common/linear.hlsl | 3 + .../app_resources/common/polar_mapping.hlsl | 20 +- 
.../common/projected_hemisphere.hlsl | 8 +- .../common/projected_sphere.hlsl | 3 + .../common/projected_spherical_rectangle.hlsl | 44 +- .../common/projected_spherical_triangle.hlsl | 21 +- .../common/spherical_rectangle.hlsl | 16 +- .../common/spherical_triangle.hlsl | 17 +- .../common/uniform_hemisphere.hlsl | 8 +- .../app_resources/common/uniform_sphere.hlsl | 9 +- .../shaders/alias_table_test.comp.hlsl | 11 +- .../shaders/bilinear_test.comp.hlsl | 28 +- .../box_muller_transform_test.comp.hlsl | 28 +- .../shaders/concentric_mapping_test.comp.hlsl | 20 +- .../cumulative_probability_test.comp.hlsl | 6 +- .../shaders/linear_test.comp.hlsl | 28 +- .../shaders/polar_mapping_test.comp.hlsl | 20 +- .../projected_hemisphere_test.comp.hlsl | 20 +- .../shaders/projected_sphere_test.comp.hlsl | 20 +- ...ojected_spherical_rectangle_test.comp.hlsl | 54 +- ...rojected_spherical_triangle_test.comp.hlsl | 41 +- .../spherical_rectangle_test.comp.hlsl | 102 +++- .../shaders/spherical_triangle.comp.hlsl | 40 +- .../shaders/test_compile.comp.hlsl | 78 ++- .../shaders/uniform_hemisphere_test.comp.hlsl | 20 +- .../shaders/uniform_sphere_test.comp.hlsl | 20 +- .../benchmarks/CDiscreteSamplerBenchmark.h | 391 +++++++------- .../benchmarks/CSamplerBenchmark.h | 6 +- 37_HLSLSamplingTests/main.cpp | 206 +++++--- .../tests/CAliasTableGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CBilinearTester.h | 5 +- .../tests/CBoxMullerTransformTester.h | 1 + .../tests/CConcentricMappingTester.h | 3 +- .../tests/CCumulativeProbabilityGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CLinearTester.h | 9 +- .../tests/CPolarMappingTester.h | 3 +- .../tests/CProjectedHemisphereTester.h | 7 +- .../tests/CProjectedSphereTester.h | 5 +- .../CProjectedSphericalRectangleTester.h | 81 ++- .../tests/CProjectedSphericalTriangleTester.h | 30 +- .../tests/CSphericalRectangleTester.h | 33 +- .../tests/CSphericalTriangleTester.h | 13 +- .../tests/CUniformHemisphereTester.h | 3 +- .../tests/CUniformSphereTester.h | 3 +- 
.../tests/SamplerTestHelpers.h | 482 ++++++++++++------ .../tests/property/CSamplerPropertyTester.h | 220 +++++--- 55 files changed, 1869 insertions(+), 784 deletions(-) create mode 100644 37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 2ac238c33..12cbb5bb1 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -113,8 +113,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", - \"KEY\": \"linear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"linear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", + \"KEY\": \"linear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", @@ -122,8 +127,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", - \"KEY\": \"uniform_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", + \"KEY\": \"uniform_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", @@ -131,8 +141,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", - \"KEY\": \"uniform_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", + \"KEY\": \"uniform_sphere_bench_1_16\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", @@ -140,8 +155,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", - \"KEY\": \"projected_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", + \"KEY\": \"projected_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", @@ -149,8 +169,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", - \"KEY\": \"projected_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", + \"KEY\": \"projected_sphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", @@ -158,8 +183,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", - \"KEY\": \"spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, 
\"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", @@ -167,8 +202,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", - \"KEY\": \"concentric_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"concentric_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", + \"KEY\": \"concentric_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", @@ -176,8 +216,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", - \"KEY\": \"polar_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"polar_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", + \"KEY\": \"polar_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", @@ -185,8 +230,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", - \"KEY\": \"bilinear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"bilinear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", + \"KEY\": \"bilinear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", @@ -194,8 +244,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", - \"KEY\": \"box_muller_transform_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": 
\"box_muller_transform_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", + \"KEY\": \"box_muller_transform_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", @@ -203,8 +258,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", @@ -212,8 +277,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_rectangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_create_only\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", @@ -221,8 +296,48 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_rectangle_bench_1_1_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": 
\"spherical_rectangle_bench_create_only_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", @@ -241,6 +356,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_yolo_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] } ] ") diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index da7048a1f..bb1ed54ef 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -27,6 +27,7 @@ struct AliasTableTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed alias table for weights {1, 2, 3, 4}: @@ -63,6 +64,7 @@ struct AliasTableTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl index 1f0a68195..5e679c98a 100644 --- a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl @@ -12,7 +12,6 @@ struct ArrayAccessor using value_type = T; template void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(data[i]); } - T operator[](uint32_t i) NBL_CONST_MEMBER_FUNC { return data[i]; } T data[N]; }; diff --git a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl index 64a13d3e1..752e547ce 100644 --- a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct BilinearTestResults float32_t forwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct BilinearTestExecutor @@ -37,6 +39,10 @@ struct BilinearTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); } + // marginFactor = 3: same reasoning as Linear; Bilinear is two Linear stages, so the skewed- + // coefficient inverse-CDF d^2/du^2 divergence near [0,1]^2 boundary applies on both axes. 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl index e8247e259..2b86e8560 100644 --- a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,6 +22,7 @@ struct BoxMullerTransformTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 separateBackwardPdf; + float32_t jacobianProduct; }; struct BoxMullerTransformTestExecutor @@ -40,6 +42,7 @@ struct BoxMullerTransformTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); output.separateBackwardPdf = sampler.separateBackwardPdf(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl index 67d8e5869..e0c6a570c 100644 --- a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ConcentricMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,15 @@ struct ConcentricMappingTestExecutor output.backwardWeight = sampling::ConcentricMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + { + sampling::ConcentricMapping 
sampler; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + // Disk-center singularity: concentric atan2 blows up as r->0. + const float32_t diskRadius = nbl::hlsl::length(output.mapped); + output.inverseJacobianPdf = diskRadius < 0.1f + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } } }; diff --git a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl index f58a22741..e66cb44fe 100644 --- a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl @@ -24,6 +24,7 @@ struct CumProbTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed CDF table for weights {1, 2, 3, 4}: @@ -46,6 +47,7 @@ struct CumProbTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index 9f1fec422..d5c1d313c 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -5,9 +5,6 @@ using namespace nbl::hlsl; -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; struct AliasTablePushConstants diff --git a/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl new file mode 100644 index 000000000..f949f5b86 --- 
/dev/null +++ b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl @@ -0,0 +1,264 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ + +#include +#include + +using namespace nbl::hlsl; + +// Negative sentinels signal "skipped" to the host verifier; the value encodes the reason. +static const float32_t JACOBIAN_SKIP_U_DOMAIN = -1.0f; +static const float32_t JACOBIAN_SKIP_CREASE = -2.0f; +static const float32_t JACOBIAN_SKIP_HEMI_BOUNDARY = -3.0f; +static const float32_t JACOBIAN_SKIP_BWD_PDF_RANGE = -4.0f; +static const float32_t JACOBIAN_SKIP_CODOMAIN_SINGULARITY = -5.0f; + + +template +struct ForwardJacobianMeasure; + +// Signed step that stays inside [0,1]: flip direction when u is in the upper half so u +/- eps +// never overshoots the domain. Magnitude is what matters (the stencil results take abs/length). +template +T signedEps(T u, T eps) +{ + return u > T(0.5) ? -eps : eps; +} + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + cache_type c; + const codomain_type L_x = _sampler.generate(u + signedEps(u, eps), c); + return nbl::hlsl::abs(L_x - L) / eps; + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + 
const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(L_x - L, L_y - L)); + return nbl::hlsl::abs(det) / (eps * eps); + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + return nbl::hlsl::length(nbl::hlsl::cross(L_x - L, L_y - L)) / (eps * eps); + } +}; + +// 3D domain: stencil perturbs u[0] and u[1] only, so the (2,3) body applies unchanged. +template +struct ForwardJacobianMeasure : ForwardJacobianMeasure +{ +}; + + +template +struct DomainMarginCheck; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u < margin || u > scalar_type(1) - margin; + } +}; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u[0] < margin || u[0] > scalar_type(1) - margin || u[1] < margin || u[1] > scalar_type(1) - margin; + } +}; + +// 3D domain: forward stencil only perturbs u[0] and u[1], so u[2] is irrelevant and (2) applies. 
+template +struct DomainMarginCheck : DomainMarginCheck +{ +}; + +enum JacobianMode : uint32_t +{ + JACOBIAN_PLAIN = 0, + JACOBIAN_CONCENTRIC = 1, // + concentric crease skip + JACOBIAN_CONCENTRIC_UXFOLD = 2 // + crease + u.x=0.5 hemi-boundary skip +}; + +// marginFactor scales the u-domain skip to marginFactor * eps. Use > 1 only for samplers whose +// stencil bias extends past a single eps-step (e.g. Arvo spherical triangle: sinZ ~ sqrt(u.y) +// gives O(h/u.y) forward-diff bias, so u.y in [0, k*eps] must be skipped). +template +float32_t computeJacobianProduct(Sampler _sampler, typename Sampler::domain_type u, float32_t eps, float32_t marginFactor) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + NBL_IF_CONSTEXPR(Mode != JACOBIAN_PLAIN) + { + // Cast via float32_t2 so this block typechecks for scalar / vec2 / vec3 domains alike + // (HLSL splats scalars, identity on vec2, .xy on vec3). 1D samplers never reach here. + const float32_t2 uxy = (float32_t2)u; + const float32_t ux = uxy.x; + const float32_t uy = uxy.y; + + NBL_IF_CONSTEXPR(Mode == JACOBIAN_CONCENTRIC_UXFOLD) + { + if (nbl::hlsl::abs(ux - float32_t(0.5)) <= float32_t(2e-3)) + return JACOBIAN_SKIP_HEMI_BOUNDARY; + } + + const bool uxFold = (Mode == JACOBIAN_CONCENTRIC_UXFOLD); + // Empirical: the concentric C0 crease's stencil bias spreads wider than the 2*eps geometric + // straddle band. Non-uxFold 6e-3 covers the disk-center residual for Projected samplers; + // uxFold 1e-2 accounts for the doubled local_ux rate when u.x is folded. + const float32_t creaseBand = uxFold ? float32_t(1e-2) : float32_t(6e-3); + const float32_t local_ux = uxFold ? 
nbl::hlsl::abs(float32_t(2) * ux - float32_t(1)) : ux; + const float32_t a = float32_t(2) * local_ux - float32_t(1); + const float32_t b = float32_t(2) * uy - float32_t(1); + if (nbl::hlsl::abs(nbl::hlsl::abs(a) - nbl::hlsl::abs(b)) <= creaseBand) + return JACOBIAN_SKIP_CREASE; + } + + using margin_check_type = DomainMarginCheck::Dimension>; + if (margin_check_type::outsideMargin(u, scalar_type(eps * marginFactor))) + return JACOBIAN_SKIP_U_DOMAIN; + + // Generate on a copy: some samplers mutate u through NBL_REF_ARG (e.g. ProjectedSphere + // consumes u.z for hemisphere selection), and the perturbations below need the original u. + cache_type cache; + domain_type uGen = u; + const codomain_type L = _sampler.generate(uGen, cache); + const scalar_type pdf = _sampler.forwardPdf(uGen, cache); + + using measure_type = ForwardJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type measure = measure_type::compute(_sampler, u, scalar_type(eps), L); + + return pdf * measure; +} + + +template +struct InverseJacobianMeasure; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type x0_lo = x; + x0_lo[0] -= eps; + codomain_type x0_hi = x; + x0_hi[0] += eps; + codomain_type x1_lo = x; + x1_lo[1] -= eps; + codomain_type x1_hi = x; + x1_hi[1] += eps; + domain_type u0_lo = _sampler.generateInverse(x0_lo); + domain_type u0_hi = _sampler.generateInverse(x0_hi); + domain_type u1_lo = _sampler.generateInverse(x1_lo); + domain_type u1_hi = _sampler.generateInverse(x1_hi); + const domain_type dudx0 = (u0_hi - u0_lo) / twoEps; + const domain_type dudx1 = (u1_hi - u1_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = 
nbl::hlsl::determinant(matrix2_type(dudx0, dudx1)); + return nbl::hlsl::abs(det); + } +}; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type t1, t2; + const codomain_type up = nbl::hlsl::abs(x[2]) < scalar_type(0.999) + ? codomain_type(scalar_type(0), scalar_type(0), scalar_type(1)) + : codomain_type(scalar_type(1), scalar_type(0), scalar_type(0)); + t1 = nbl::hlsl::normalize(nbl::hlsl::cross(up, x)); + t2 = nbl::hlsl::cross(x, t1); + domain_type u_t1_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t1 * eps)); + domain_type u_t1_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t1 * eps)); + domain_type u_t2_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t2 * eps)); + domain_type u_t2_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t2 * eps)); + const domain_type dudt1 = (u_t1_hi - u_t1_lo) / twoEps; + const domain_type dudt2 = (u_t2_hi - u_t2_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(dudt1, dudt2)); + return nbl::hlsl::abs(det); + } +}; + +template +float32_t computeInverseJacobianPdf(Sampler _sampler, typename Sampler::codomain_type sample, float32_t backwardPdf, float32_t pdfMin, float32_t pdfMax) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + if (backwardPdf < scalar_type(pdfMin) || backwardPdf > scalar_type(pdfMax)) + return JACOBIAN_SKIP_BWD_PDF_RANGE; + + using measure_type = InverseJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type eps = scalar_type(1e-3); + return measure_type::compute(_sampler, sample, eps); +} + 
+#endif diff --git a/37_HLSLSamplingTests/app_resources/common/linear.hlsl b/37_HLSLSamplingTests/app_resources/common/linear.hlsl index b27d88e5b..af269ad2f 100644 --- a/37_HLSLSamplingTests/app_resources/common/linear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/linear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct LinearTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct LinearTestExecutor @@ -37,6 +39,7 @@ struct LinearTestExecutor output.backwardPdf = _sampler.backwardPdf(output.generated); output.backwardWeight = _sampler.backwardWeight(output.generated); } + output.jacobianProduct = computeJacobianProduct(_sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl index 82e020fdc..e4b8ffabb 100644 --- a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct PolarMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,23 @@ struct PolarMappingTestExecutor output.backwardWeight = sampling::PolarMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + + { + sampling::PolarMapping sampler; + // marginFactor = 3: r = sqrt(u.x) gives O(h/u.x) forward-diff bias near u.x=0, so skip + // u.x within 3*eps of the domain boundary (same reasoning as Linear's skewed-density case). 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + // Two inverse singularities: + // - disk center: atan2 diverges as r -> 0 + // - atan2 branch cut at y=0, x>0: the stencil's +/-eps in y straddles the 2*pi wrap, + // producing du.y/eps ~ 1/eps spikes (seen as test values ~305-862 with eps=1e-3). + const float32_t polarRadius = nbl::hlsl::length(output.mapped); + const bool onCutBand = nbl::hlsl::abs(output.mapped.y) < 5e-3f && output.mapped.x > 0.0f; + output.inverseJacobianPdf = (polarRadius < 0.1f || onCutBand) + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl index 9697cf0df..c48697b03 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct ProjectedHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct ProjectedHemisphereTestExecutor @@ -43,7 +45,11 @@ struct ProjectedHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); + const float32_t phDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = phDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 1e-3f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl index e9886b61d..a78a937f6 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ProjectedSphereTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct ProjectedSphereTestExecutor @@ -38,6 +40,7 @@ struct ProjectedSphereTestExecutor } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl index 8370952ca..4aed7d9c3 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,12 +25,10 @@ struct ProjectedSphericalRectangleTestResults float32_t2 surfaceOffset; float32_t3 referenceDirection; float32_t forwardPdf; - float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; - float32_t backwardPdfAtGenerated; - float32_t backwardWeightAtGenerated; float32_t2 extents; + float32_t jacobianProduct; }; struct ProjectedSphericalRectangleTestExecutor @@ -46,30 +45,29 @@ struct ProjectedSphericalRectangleTestExecutor output.extents = rect.extents; 
sampling::ProjectedSphericalRectangle::cache_type cache; + output.generated = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + // backwardWeight now takes a 3D direction; evaluate at generated L. + output.backwardWeight = sampler.backwardWeight(output.generated); + + float32_t2 absXY; { - output.generated = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - } - { - sampling::ProjectedSphericalRectangle::cache_type offsetCache; - output.surfaceOffset = sampler.generateSurfaceOffset(input.u, offsetCache); + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = sampler.bilinearPatch.generate(input.u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + absXY = sampler.sphrect.generateLocalBasisXY(warped, sphrectCache); + output.surfaceOffset = absXY - float32_t2(sampler.sphrect.r0.x, sampler.sphrect.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.sphrect.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localPoint = float32_t3(absXY.x, absXY.y, sampler.sphrect.r0.z); + const float32_t3 localDir = nbl::hlsl::normalize(localPoint); + output.referenceDirection = sampler.sphrect.basis[0] * localDir[0] + + sampler.sphrect.basis[1] * localDir[1] + + sampler.sphrect.basis[2] * localDir[2]; } - // Test backwardPdf/Weight at the rect center: a deterministic interior point - // that avoids amplifying generate's FP errors through backward evaluation. 
- const float32_t2 center = float32_t2(0.5, 0.5); - output.backwardPdf = sampler.backwardPdf(center); - output.backwardWeight = sampler.backwardWeight(center); - // Use cache.warped (the [0,1]^2 input to the spherical rect warp) for consistency - // checks, NOT generated/extents (the nonlinear warp output). The bilinear in - // forwardPdf evaluates at cache.warped, so backwardPdf must too. - output.backwardPdfAtGenerated = sampler.backwardPdf(cache.warped); - output.backwardWeightAtGenerated = sampler.backwardWeight(cache.warped); + + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl index 5c81e53e0..0c424590b 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,11 +22,10 @@ struct ProjectedSphericalTriangleTestResults { float32_t3 generated; float32_t forwardPdf; - float32_t backwardPdf; - float32_t backwardPdfAtGenerated; float32_t forwardWeight; float32_t backwardWeight; float32_t backwardWeightAtGenerated; + float32_t jacobianProduct; }; struct ProjectedSphericalTriangleTestExecutor @@ -43,15 +43,20 @@ struct ProjectedSphericalTriangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } - // Test backwardPdf/Weight at the triangle centroid: a deterministic interior point computed - // from only basic arithmetic + sqrt (IEEE 754 exact), so CPU and GPU agree bit-exactly. - // Using output.generated would amplify generate's transcendental FP errors through - // generateInverse's acos, producing CPU/GPU divergence. 
const float32_t3 center = nbl::hlsl::normalize(input.vertex0 + input.vertex1 + input.vertex2); - output.backwardPdf = sampler.backwardPdf(center); output.backwardWeight = sampler.backwardWeight(center); - output.backwardPdfAtGenerated = sampler.backwardPdf(output.generated); output.backwardWeightAtGenerated = sampler.backwardWeight(output.generated); + // Check the bilinear-warped (inner) u directly: for skinny triangles with a strongly biased + // receiver normal, outer u well inside [0,1] can still warp to inner u <~ 0.02 where Arvo's + // sqrt(sinZ) noise dominates. Pre-skip on the inner u instead of padding an outer marginFactor. + sampling::Bilinear::cache_type bc; + const float32_t2 innerU = sampler.bilinearPatch.generate(input.u, bc); + const float32_t innerMargin = 0.02f; + const bool innerNearEdge = innerU.x < innerMargin || innerU.x > (1.0f - innerMargin) + || innerU.y < innerMargin || innerU.y > (1.0f - innerMargin); + output.jacobianProduct = innerNearEdge + ? JACOBIAN_SKIP_U_DOMAIN + : computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 9ae4df256..4f8d20964 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -26,6 +27,7 @@ struct SphericalRectangleTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 extents; + float32_t jacobianProduct; }; struct SphericalRectangleTestExecutor @@ -47,17 +49,23 @@ struct SphericalRectangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } + float32_t2 absXY; { sampling::SphericalRectangle::cache_type cache; - output.surfaceOffset = 
sampler.generateSurfaceOffset(input.u, cache); + absXY = sampler.generateLocalBasisXY(input.u, cache); + output.surfaceOffset = absXY - float32_t2(sampler.r0.x, sampler.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localDir = nbl::hlsl::normalize(float32_t3(absXY.x, absXY.y, sampler.r0.z)); + output.referenceDirection = sampler.basis[0] * localDir[0] + + sampler.basis[1] * localDir[1] + + sampler.basis[2] * localDir[2]; } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation + // for u.x within ~2*eps of 0 or 1 (au near n*pi), leaving ~0.5% residual at factor 3. + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 291661629..1828139d4 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,6 +25,7 @@ struct SphericalTriangleTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; // Minimum signed distance to a triangle edge (sin of angular distance to nearest great circle). // Positive = inside, negative = outside. Allows tolerance at boundaries. 
float32_t generatedInside; @@ -39,7 +41,7 @@ struct SphericalTriangleTestExecutor const float32_t3 verts[3] = { input.vertex0, input.vertex1, input.vertex2 }; shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); // Forward: u -> v { @@ -57,10 +59,8 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - - // Jacobian product: (1/forwardPdf) * backwardPdf should equal 1 for bijective samplers - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: // A point is inside the spherical triangle iff it is on the "inside" half-plane @@ -79,6 +79,13 @@ struct SphericalTriangleTestExecutor float32_t2 u = output.inverted; output.invertedInDomain = nbl::hlsl::min(nbl::hlsl::min(u.x, float32_t(1.0) - u.x), nbl::hlsl::min(u.y, float32_t(1.0) - u.y)); + + const float32_t uMargin = 1e-2f; + const bool nearUBoundary = output.inverted.x < uMargin || output.inverted.x > (1.0f - uMargin) + || output.inverted.y < uMargin || output.inverted.y > (1.0f - uMargin); + output.inverseJacobianPdf = nearUBoundary + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.1f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index 76a724774..fb51838c7 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformHemisphereTestExecutor @@ -42,7 +44,11 @@ struct UniformHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t uhDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = uhDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3780b82ef..3737f4575 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformSphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformSphereTestExecutor @@ -43,7 +45,12 @@ struct UniformSphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t usDiskR = nbl::hlsl::length((float32_t2)output.generated); + const float32_t absZ = nbl::hlsl::abs(output.generated.z); + output.inverseJacobianPdf = (absZ < 0.1f || usDiskR < 0.1f) + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl index 72c4f1977..67047f997 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl @@ -58,18 +58,15 @@ void main() float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; uint32_t acc = 0u; - uint32_t accPdf = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated; - accPdf ^= asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc + accPdf); + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); #else AliasTableTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 06aad4fdc..03ac7b36a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 
+24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; - sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::Bilinear::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; + sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::Bilinear::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index cf0f4065a..6189d4658 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb stddev by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - u.x = max(u.x, 1e-7f); - sampling::BoxMullerTransform::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + u.x = max(u.x, 1e-7f); + sampling::BoxMullerTransform::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git 
a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 973aba4fe..649c323b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ConcentricMapping::cache_type cache; - float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ConcentricMapping::cache_type cache; + float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 2e48adc4a..1091ee447 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -46,10 +46,10 @@ void main() for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchCumProbSampler::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 614f339b4..17cf83ac5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; - sampling::Linear sampler = sampling::Linear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t u = float32_t(rng()) * toFloat; - sampling::Linear::cache_type cache; - float32_t generated = sampler.generate(u, cache); - acc ^= asuint(generated); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; + sampling::Linear sampler = sampling::Linear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t u = float32_t(rng()) * toFloat; + sampling::Linear::cache_type cache; + float32_t generated = sampler.generate(u, cache); + acc ^= asuint(generated); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index db7488acd..e0cf7aea0 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() 
nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::PolarMapping::cache_type cache; - float32_t2 generated = sampling::PolarMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::PolarMapping::cache_type cache; + float32_t2 generated = sampling::PolarMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index 871444955..d1ef313e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / 
uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::ProjectedHemisphere sampler; - sampling::ProjectedHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 67a3fa662..9b8c234c4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; sampling::ProjectedSphere sampler; - sampling::ProjectedSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, 
cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; + sampling::ProjectedSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index 903075804..ca9b4d43e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1, 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -21,25 +27,49 @@ main() const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS // Perturb rectangle origin by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + // Read a cheap function of sampler state so create() can't be elided. 
+ sampling::ProjectedSphericalRectangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + // Unified create:generate loop — one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 83e47b3e1..3d8ec8961 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && 
defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,23 +24,40 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices and normal by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + 
sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 3e9a6fcae..b9766d5ff 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1 (create+generate per iter), 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. 
+#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,26 +26,96 @@ main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb rectangle origin by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::SphericalRectangle sampler = sampling::SphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f)); + // Observer at origin so origin - observer = (p, p, -2) has no zero components: + // keeps all 4 denorm_n_z components perturbation-dependent (no constant-folding). + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; + +#if (defined(BENCH_VARIANT_SA_EXTENTS) || defined(BENCH_VARIANT_R0_EXTENTS)) && !defined(BENCH_CREATE_ONLY) + // variants 2/3 pre-build: produce a rect (for its basis, sa, extents) once per thread. 
+ shapes::CompressedSphericalRectangle compressedBase; + compressedBase.origin = float32_t3(perturbationBase, perturbationBase, -2.0f); + compressedBase.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedBase.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectBase = shapes::SphericalRectangle::create(compressedBase); + const typename shapes::SphericalRectangle::solid_angle_type saBase = rectBase.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + const float32_t2 extentsBase = rectBase.extents; + const matrix basisBase = rectBase.basis; +#endif nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + typename shapes::SphericalRectangle::solid_angle_type sa = rect.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + sampler = sampling::SphericalRectangle::create(rect.basis, sa, rect.extents); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // Build a basis from the same rect geometry so create(basis, r0, extents) has the right frame. 
+ shapes::CompressedSphericalRectangle compressedR0; + compressedR0.origin = float32_t3(perturbation, perturbation, -2.0f); + compressedR0.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedR0.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectR0 = shapes::SphericalRectangle::create(compressedR0); + const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(rectR0.basis, r0, extents); + #else + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + // Read a cheap function of sampler state so create() can't be elided. + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); } +#else + // Unified create:generate loop - one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + // variant 2: create(basis, sa, extents). Poison one cosGamma so the sincos_accumulator can't be hoisted. + typename shapes::SphericalRectangle::solid_angle_type sa = saBase; + sa.cosGamma[2] += perturbation; + sampler = sampling::SphericalRectangle::create(basisBase, sa, extentsBase); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // variant 3: create(basis, r0, extents). r0 matches what variant 1 produces. 
+ const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(basisBase, r0, extents); + #else + // variant 1 (default): create(shape, observer). + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index 55991bcb3..3595ac86a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -11,32 +11,50 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = 
shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index 908520243..cd43c630e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -1,4 +1,8 @@ +#pragma shader_stage(compute) + // Compile test: instantiate all sampling types and their concept-required methods to verify DXC compilation +#include +#include #include #include #include @@ -9,12 +13,15 @@ #include #include #include +#include +#include +#include +#include "../common/array_accessor.hlsl" using namespace nbl::hlsl; [[vk::binding(0, 0)]] RWStructuredBuffer output; [numthreads(1, 1, 1)] -[shader("compute")] void main() { float32_t2 u2 = float32_t2(0.5, 0.5); @@ -119,7 +126,7 @@ void main() // Octant triangle: all dot products between vertices are 0, so cos_sides=0, csc_sides=1 const float32_t3 triVerts[3] = {float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)}; shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::createFromUnitSphereVertices(triVerts); - sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); sampling::SphericalTriangle::cache_type sphTriCache; float32_t3 stSample = sphTri.generate(u2, 
sphTriCache); acc.xyz += stSample; @@ -129,7 +136,7 @@ void main() acc.x += sphTri.backwardPdf(stSample); acc.x += sphTri.backwardWeight(stSample); - // SphericalRectangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // SphericalRectangle — generate, generateSurfaceOffset, forwardPdf, backwardPdf, forwardWeight, backwardWeight shapes::CompressedSphericalRectangle csr; csr.origin = float32_t3(0.0, 0.0, -1.0); csr.right = float32_t3(1.0, 0.0, 0.0); @@ -140,20 +147,81 @@ void main() sampling::SphericalRectangle::cache_type sphRectCache; float32_t3 srSample = sphRect.generate(u2, sphRectCache); acc.xyz += srSample; + acc.xy += sphRect.generateLocalBasisXY(u2, sphRectCache); acc.x += sphRect.forwardPdf(u2, sphRectCache); acc.x += sphRect.forwardWeight(u2, sphRectCache); acc.x += sphRect.backwardPdf(srSample); acc.x += sphRect.backwardWeight(srSample); - // ProjectedSphericalTriangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // ProjectedSphericalTriangle — generate, forwardPdf, forwardWeight, backwardWeight(L) sampling::ProjectedSphericalTriangle projTri = sampling::ProjectedSphericalTriangle::create(shapeTri, float32_t3(0.0, 0.0, 1.0), false); sampling::ProjectedSphericalTriangle::cache_type projTriCache; float32_t3 ptSample = projTri.generate(u2, projTriCache); acc.xyz += ptSample; acc.x += projTri.forwardPdf(u2, projTriCache); acc.x += projTri.forwardWeight(u2, projTriCache); - acc.x += projTri.backwardPdf(ptSample); acc.x += projTri.backwardWeight(ptSample); + // ProjectedSphericalRectangle (UsePdfAsWeight=true) — generate, forwardPdf, forwardWeight, backwardWeight(L) + const float32_t3 psrNormal = float32_t3(0.0, 0.0, 1.0); + sampling::ProjectedSphericalRectangle projRectPdf = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, false); + sampling::ProjectedSphericalRectangle::cache_type projRectPdfCache; + float32_t3 prPdfSample = projRectPdf.generate(u2, projRectPdfCache); + acc.xyz += 
prPdfSample; + acc.x += projRectPdf.forwardPdf(u2, projRectPdfCache); + acc.x += projRectPdf.forwardWeight(u2, projRectPdfCache); + acc.x += projRectPdf.backwardWeight(prPdfSample); + + // ProjectedSphericalRectangle (UsePdfAsWeight=false) — exercise the MIS-weight path + sampling::ProjectedSphericalRectangle projRectMis = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, true); + sampling::ProjectedSphericalRectangle::cache_type projRectMisCache; + float32_t3 prMisSample = projRectMis.generate(u2, projRectMisCache); + acc.xyz += prMisSample; + acc.x += projRectMis.forwardPdf(u2, projRectMisCache); + acc.x += projRectMis.forwardWeight(u2, projRectMisCache); + acc.x += projRectMis.backwardWeight(prMisSample); + + // AliasTable — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor aliasProb; + aliasProb.data[0] = 0.25; aliasProb.data[1] = 0.5; aliasProb.data[2] = 0.75; aliasProb.data[3] = 1.0; + ArrayAccessor aliasIdx; + aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; + ArrayAccessor aliasPdf; + aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; + sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; + uint32_t aliasBin0 = aliasTable.generate(0.3); + uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); + acc.x += float32_t(aliasBin0 + aliasBin); + acc.x += aliasTable.forwardPdf(0.3, aliasCache); + acc.x += aliasTable.forwardWeight(0.3, aliasCache); + acc.x += aliasTable.backwardPdf(aliasBin); + acc.x += aliasTable.backwardWeight(aliasBin); + + // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor cumProb; + cumProb.data[0] 
= 0.25; cumProb.data[1] = 0.5; cumProb.data[2] = 0.75; + sampling::CumulativeProbabilitySampler > cumSampler = + sampling::CumulativeProbabilitySampler >::create(cumProb, 4u); + sampling::CumulativeProbabilitySampler >::cache_type cumCache; + uint32_t cumBin0 = cumSampler.generate(0.6); + uint32_t cumBin = cumSampler.generate(0.6, cumCache); + acc.x += float32_t(cumBin0 + cumBin); + acc.x += cumSampler.forwardPdf(0.6, cumCache); + acc.x += cumSampler.forwardWeight(0.6, cumCache); + acc.x += cumSampler.backwardPdf(cumBin); + acc.x += cumSampler.backwardWeight(cumBin); + + // PartitionRandVariable — operator() partitions u into a left/right branch + sampling::PartitionRandVariable partition; + partition.leftProb = 0.25; + float32_t partXi = 0.5; + float32_t partRcp; + bool partRight = partition(partXi, partRcp); + acc.x += partXi + partRcp + float32_t(partRight ? 1 : 0); + output[0] = acc; } diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index d0990ef43..3c43ee119 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; 
sampling::UniformHemisphere sampler; - sampling::UniformHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 0d33f5c11..5879e28bb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::UniformSphere sampler; - sampling::UniformSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < 
uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 8f85545b3..02fbf58d2 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -12,8 +12,11 @@ using namespace nbl; // Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds both tables from the same weight distribution, uploads via BDA buffers, -// and measures GPU throughput using timestamp queries. +// Builds pipelines once, then sweeps a list of table sizes. For each N it builds +// both tables from the same weight distribution, uploads via BDA buffers, and +// measures GPU throughput using timestamp queries. The cumulative probability +// sampler is run in two variants: the stateful-comparator cache population +// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: @@ -26,17 +29,17 @@ class CDiscreteSamplerBenchmark video::IPhysicalDevice* physicalDevice; std::string aliasShaderKey; std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; uint32_t computeFamilyIndex; uint32_t dispatchGroupCount; - uint32_t tableSize; }; void setup(const SetupData& data) { m_device = data.device; m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_tableSize = data.tableSize; m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); @@ -44,8 +47,6 @@ class CDiscreteSamplerBenchmark // Command pool + buffers m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf); // Timestamp query pool { @@ -56,61 +57,9 @@ class CDiscreteSamplerBenchmark m_queryPool = m_device->createQueryPool(qp); } - // Generate random weights - const uint32_t N = m_tableSize; - std::vector weights(N); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < N; i++) - weights[i] = dist(rng); - - // Build alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); - - // Build cumulative probability table - std::vector cumProb(N - 1); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - // Create BDA buffers and upload data - auto createBdaBuffer = [&](const void* 
srcData, size_t bytes) -> core::smart_refctd_ptr - { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - // Flush so GPU can see the written data - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } - return buf; - }; - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Alias table buffers - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - - // CDF buffer - m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1) * sizeof(float)); - - // Shared output buffer + // Shared output buffer (size only depends on thread count) { video::IGPUBuffer::SCreationParams bp = {}; bp.size = totalThreads * sizeof(uint32_t); @@ -122,163 +71,218 @@ class CDiscreteSamplerBenchmark m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } - // Create pipelines (push constants only, no descriptor sets) - auto loadShader = [&](const std::string& key) + // Pipelines (N-independent; only push constants change per run) + m_aliasPipeline = 
createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + } + + // DispatchScheduler: uint32_t N -> std::pair. + // Lets the caller trade wall-clock for statistical stability per size: + // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. + struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + + // Sweep a list of table sizes. For each N: build tables from a fresh weight + // distribution (deterministic seed = 42 + N so different N's get distinct + // distributions but runs are reproducible), upload via BDA, then run all + // three samplers with the dispatch counts chosen by `scheduler`. + template + void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) + { + const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; + m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", + system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", + system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + + for (uint32_t N : tableSizes) + { + const DispatchCounts dc = scheduler(N); + buildAndUpload(N); + runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + releaseTables(); + } + } + + // Convenience: sweep with fixed dispatch counts for every size. 
+ void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + { + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { + return {warmupIterations, benchmarkIterations}; + }); + } + + private: + enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + + template + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + { + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; + auto layout = m_device->createPipelineLayout({&pcRange, 1}); + if (!layout) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); + + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = asset::IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); + if (!shader) + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + + video::IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(key, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - return m_device->compileShader({.source = source.get()}); - }; - - // Alias table pipeline + pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + + 
core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(AliasTablePushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.aliasShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load alias shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_aliasPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias compute pipeline", system::ILogger::ELL_ERROR); - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_aliasPipeline->getExecutableInfo()); - m_logger->log("Alias Table Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_aliasPplnLayout = std::move(layout); + auto report = system::to_string(pipeline->getExecutableInfo()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); } + outLayout = std::move(layout); + return pipeline; + } - // CDF pipeline + core::smart_refctd_ptr 
createBdaBuffer(const void* srcData, size_t bytes) + { + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto buf = m_device->createBuffer(std::move(bp)); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + + const auto allocSize = alloc.memory->getAllocationSize(); + if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(CumProbPushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.cumProbShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load cumprob shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_cumProbPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob compute pipeline", system::ILogger::ELL_ERROR); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_cumProbPipeline->getExecutableInfo()); - 
m_logger->log("Cumulative Probability Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_cumProbPplnLayout = std::move(layout); + std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); + video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); + m_device->flushMappedMemoryRanges(1u, &flushRange); + alloc.memory->unmap(); } + return buf; } - void run(uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void buildAndUpload(uint32_t N) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - const uint32_t totalThreads = m_dispatchGroupCount * benchWorkgroupSize; - m_logger->log("=== GPU Discrete Sampler Benchmark (N=%u, %u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - system::ILogger::ELL_PERFORMANCE, m_tableSize, benchmarkIterations, totalThreads, BENCH_ITERS); + m_currentN = N; + + std::vector weights(N); + std::mt19937 rng(42u + N); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < N; i++) + weights[i] = dist(rng); + + // Alias table + std::vector aliasProb(N); + std::vector aliasIdx(N); + std::vector aliasPdf(N); + std::vector workspace(N); + nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + + // Cumulative probability (N-1 entries, last bucket implicitly 1.0) + std::vector cumProb(N > 0 ? 
N - 1 : 0); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - runSingle("AliasTable", m_aliasPipeline, m_aliasPplnLayout, true, warmupIterations, benchmarkIterations); - runSingle("CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, false, warmupIterations, benchmarkIterations); + m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); + m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); + const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); + m_cumProbBuf = cumProbBytes ? createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; } - private: - void runSingle(const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, bool isAlias, uint32_t warmupIterations, uint32_t benchmarkIterations) + void releaseTables() + { + m_aliasProbBuf = nullptr; + m_aliasIdxBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_cumProbBuf = nullptr; + } + + void runSingle( + uint32_t N, + const char* name, + const core::smart_refctd_ptr& pipeline, + const core::smart_refctd_ptr& layout, + SamplerKind kind, + uint32_t warmupIterations, + uint32_t benchmarkIterations) { m_device->waitIdle(); - // Record benchmark command buffer + // Everything (warmup, timestamped bench, cooldown) goes into ONE cmdbuf and ONE + // submit. Serial submissions with semaphore waits between them would add sync cost + // to every dispatch and prevent the driver from overlapping adjacent dispatches. + // With a single cmdbuf the driver pipelines freely, and GPU memory latency is + // hidden by warp hyperthreading rather than by cross-submit overlap. + // + // Layout: [warmup dispatches] [ts 0] [bench dispatches] [ts 1] [cooldown dispatches] + // Warmup brings clocks + caches to steady state before ts 0. 
Cooldown keeps the + // same steady-state context alive across ts 1 so the trailing bench dispatches + // don't measure a tail where the GPU is already winding down. + const uint32_t cooldownIterations = warmupIterations; + m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (isAlias) + if (kind == SamplerKind::Alias) { AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); + pc.probAddress = m_aliasProbBuf->getDeviceAddress(); pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.cumProbAddress = m_cumProbBuf ? 
m_cumProbBuf->getDeviceAddress() : 0ull; + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + for (uint32_t i = 0u; i < warmupIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < benchmarkIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - // Record timestamp command buffers - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = {{.cmdbuf = m_timestampBeforeCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = {{.cmdbuf = m_timestampAfterCmdbuf.get()}}; - - auto submitSerial = [&](const 
video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; - - for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -288,36 +292,37 @@ class CDiscreteSamplerBenchmark m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); constexpr uint32_t benchIters = BENCH_ITERS; - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(benchWorkgroupSize); + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); const float64_t ps_per_sample = elapsed_ns * 
1e3 / float64_t(totalSamples); const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", system::ILogger::ELL_PERFORMANCE, name, ps_per_sample, gsamples_per_s, elapsed_ms); + m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", + system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; - // Alias table + // Pipelines (set up once) core::smart_refctd_ptr m_aliasPplnLayout; core::smart_refctd_ptr m_aliasPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + + // Per-N data buffers (rebuilt each sweep step) core::smart_refctd_ptr m_aliasProbBuf; core::smart_refctd_ptr m_aliasIdxBuf; core::smart_refctd_ptr m_aliasPdfBuf; - - // Cumulative probability - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; core::smart_refctd_ptr m_cumProbBuf; // Shared @@ -325,7 +330,7 @@ class CDiscreteSamplerBenchmark video::IQueue* m_queue = nullptr; video::IPhysicalDevice* m_physicalDevice = nullptr; uint32_t m_dispatchGroupCount = 0; - uint32_t m_tableSize = 0; + uint32_t m_currentN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 3e2092670..9f9854ac5 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -162,7 +162,7 @@ class 
CSamplerBenchmark } // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. - void run(const std::string& samplerName, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); recordBenchmarkCmdBuf(); @@ -213,9 +213,9 @@ class CSamplerBenchmark const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", + m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f", system::ILogger::ELL_PERFORMANCE, - samplerName.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); + samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); } private: diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 98ea127cc..470132aba 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -51,12 +51,11 @@ using namespace nbl::examples; #include "benchmarks/CDiscreteSamplerBenchmark.h" #include "tests/property/CSamplerPropertyTester.h" -constexpr bool DoBenchmark = true; class HLSLSamplingTests final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using asset_base_t = BuiltinResourcesApplication; public: HLSLSamplingTests(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -64,7 +63,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override { - auto retval = device_base_t::getPreferredDeviceFeatures(); + auto retval 
= device_base_t::getPreferredDeviceFeatures(); retval.pipelineExecutableInfo = true; return retval; } @@ -80,10 +79,10 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // test compile with dxc { IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); - auto bundle = m_assetMgr->getAsset(key.c_str(), lp); + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); + auto bundle = m_assetMgr->getAsset(key.c_str(), lp); const auto assets = bundle.getContents(); if (assets.empty()) @@ -155,8 +154,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); @@ -166,7 +165,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); - static_assert(sampling::concepts::BijectiveSampler>); + static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); 
static_assert(sampling::concepts::BijectiveSampler>); @@ -180,89 +179,162 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - const uint32_t testBatchCount = 1024; + // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough + // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is + // hidden by hyperthreading rather than by cross-dispatch overlap. + constexpr uint32_t testBatchCount = 4096; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; + constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize; - constexpr uint32_t iterationsPerThread = BENCH_ITERS; + constexpr uint32_t iterationsPerThread = BENCH_ITERS; constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; struct BenchEntry { CSamplerBenchmark bench; - std::string name; + std::string sampler; + std::string mode; }; std::vector benchmarks; - auto addBench = [&](const char* name, const std::string& shaderKey, size_t inputSize, size_t outputSize) + auto addBench = [&](const char* sampler, const char* mode, const std::string& shaderKey, size_t inputSize, size_t outputSize) { - auto& entry = benchmarks.emplace_back(); - entry.name = name; + auto& entry = benchmarks.emplace_back(); + entry.sampler = sampler; + entry.mode = mode; CSamplerBenchmark::SetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = 
getComputeQueue()->getFamilyIndex(); - data.shaderKey = shaderKey; + data.shaderKey = shaderKey; data.dispatchGroupCount = testBatchCount; data.samplesPerDispatch = benchSamplesPerDispatch; - data.inputBufferBytes = inputSize; - data.outputBufferBytes = outputSize; + data.inputBufferBytes = inputSize; + data.outputBufferBytes = outputSize; entry.bench.setup(data); }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - addBench("Linear", nbl::this_example::builtin::build::get_spirv_key<"linear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + 
//addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); // Print all pipeline reports first 
for (auto& entry : benchmarks) - entry.bench.logPipelineReport(entry.name); + entry.bench.logPipelineReport(entry.sampler + " (" + entry.mode + ")"); // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; - dsData.tableSize = 1024; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); // Then run all benchmarks here so the reports are at the top of the log, followed by timings - constexpr uint32_t warmupDispatches = 500; - constexpr uint32_t benchDispatches = 5000; - m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); - for (auto& entry : benchmarks) - entry.bench.run(entry.name, 
warmupDispatches, benchDispatches); - - discreteBench.run(warmupDispatches, benchDispatches); + { + constexpr uint32_t warmupDispatches = 300; + constexpr uint32_t benchDispatches = 1000; + m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", + ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); + m_logger->log(" %-28s | %-38s | %12s | %12s | %12s", + ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); + for (auto& entry : benchmarks) + entry.bench.run(entry.sampler, entry.mode, warmupDispatches, benchDispatches); + } + + { + // Sweep covers both the YOLO-vs-Comparator comparison (explicit points at + // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from + // N=4 up to 32M in a roughly-power-of-8 progression. + const std::vector discreteSizes = { + 4u, + 16u, + 32u, + 100u, + 128u, + 512u, + 8192u, + 10000u, + 131072u, + 1000000u, + 2097152u, + 16777216u, + 33554432u, + }; + + // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock + // per sampler-per-N stays near 1.5 s. Cost model comes from the prior + // sweep (order-of-magnitude ps/sample vs N). 
+ auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts + { + double ps_per_sample; + if (N < 1000u) ps_per_sample = 15.0; // L1-resident + else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 + else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge + else ps_per_sample = 8000.0; // DRAM-bound + + constexpr double targetNs = 1.5e9; // ~1.5 s per bench + constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); + const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); + const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); + const uint32_t warmup = std::max(20u, bench / 10u); + return {warmup, bench}; + }; + + discreteBench.runSweep(discreteSizes, dispatchScheduler); + } } } @@ -270,21 +342,20 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Runtime CPU/GPU comparison tests using ITester harness // ================================================================ bool pass = true; - const uint32_t workgroupSize = WORKGROUP_SIZE; // generic lambda to run a GPU sampler test auto runSamplerTest = [&](const char* testName, auto spirvKey, const char* logFile) { m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, workgroupSize); + data.shaderKey = spirvKey; + Tester tester(testBatchCount, WORKGROUP_SIZE); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -307,7 +378,7 @@ class HLSLSamplingTests 
final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt"); } - if constexpr (true) + if constexpr (DoBenchmark) { // --- Discrete table construction (CPU) --- { @@ -320,6 +391,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } + logJacobianSkipCounts(m_logger.get()); if (pass) m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO); else diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 87aac65ba..32f0e3b28 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -52,6 +52,7 @@ class CAliasTableGPUTester final : public ITesterlog(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; @@ -140,7 +141,7 @@ struct LinearStressConfig { using nbl::system::to_string; logger->log(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CPolarMappingTester.h b/37_HLSLSamplingTests/tests/CPolarMappingTester.h index f7009176b..6c43f8877 100644 --- 
a/37_HLSLSamplingTests/tests/CPolarMappingTester.h +++ b/37_HLSLSamplingTests/tests/CPolarMappingTester.h @@ -46,7 +46,8 @@ class CPolarMappingTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); - ProjectedSphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. - input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); - - // Build shape to use centralized corner check nbl::hlsl::shapes::CompressedSphericalRectangle compressed; - compressed.origin = input.rectOrigin; - compressed.right = input.right; - compressed.up = input.up; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + + ProjectedSphericalRectangleInputValues input; + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; + auto shape = nbl::hlsl::shapes::SphericalRectangle::create(compressed); // Ensure the receiver normal has positive projection onto at least one vertex, @@ -63,25 +58,25 @@ class CProjectedSphericalRectangleTester final : public ITester actual.extents.x || - actual.surfaceOffset.y < 0.0f || actual.surfaceOffset.y > actual.extents.y) + PdfCheck {"ProjectedSphericalRectangle::forwardPdf", &R::forwardPdf}); + VERIFY_JACOBIAN_OR_SKIP(pass, "ProjectedSphericalRectangle::jacobianProduct", 1.0f, actual.jacobianProduct, iteration, seed, testType, 5e-2, 5e-2); + + constexpr float boundsEps = 1e-5f; + if (actual.surfaceOffset.x < -boundsEps || actual.surfaceOffset.x > actual.extents.x + boundsEps || + actual.surfaceOffset.y < -boundsEps || actual.surfaceOffset.y > actual.extents.y 
+ boundsEps) { pass = false; - printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, 0.0); + printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, boundsEps); } // generate must be unit length @@ -90,7 +85,7 @@ class CProjectedSphericalRectangleTester final : public ITester createProjectedRectSampler( +inline nbl::hlsl::sampling::ProjectedSphericalRectangle createProjectedRectSampler( std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer, @@ -121,15 +116,16 @@ inline nbl::hlsl::sampling::ProjectedSphericalRectangle cr outNormal = generateRandomUnitVector(rng); } while (!anyRectCornerAboveHorizon(shape, observer, outNormal)); - return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); + return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); } struct ProjectedSphericalRectanglePropertyConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; - static constexpr uint32_t samplesPerConfig = 20000; + static constexpr uint32_t samplesPerConfig = 50000; static constexpr bool hasMCNormalization = true; static constexpr bool hasGridIntegration = false; static constexpr float64_t mcNormalizationRelTol = 0.08; @@ -155,23 +151,20 @@ struct ProjectedSphericalRectanglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; struct ProjectedSphericalRectangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -202,17 +195,13 @@ struct ProjectedSphericalRectangleGrazingConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h index 31f85ba02..0460a30ee 100644 --- a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h +++ b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h @@ -60,17 +60,19 @@ class CProjectedSphericalTriangleTester final : public ITester; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -117,18 +120,19 @@ struct ProjectedSphericalTrianglePropertyConfig // E[1/pdf] = solidAngle * E[1/bilinearPdf] = solidAngle * 1.0 = solidAngle static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; struct ProjectedSphericalTriangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -169,12 +173,12 @@ struct ProjectedSphericalTriangleGrazingConfig static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index 2a6030b78..fa5c93ccb 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -20,17 +20,17 @@ class CSphericalRectangleTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); + nbl::hlsl::shapes::CompressedSphericalRectangle compressed; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + SphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. 
- input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; input.u = nbl::hlsl::float32_t2(uDist(getRandomEngine()), uDist(getRandomEngine())); m_inputs.push_back(input); return input; @@ -48,16 +48,21 @@ class CSphericalRectangleTester final : public ITester; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -121,7 +124,7 @@ struct SphericalTrianglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; @@ -130,7 +133,7 @@ struct SphericalTrianglePropertyConfig // These stress the C_s great-circle intersection and v-recovery in generateInverse. 
struct SphericalTriangleStressConfig { - using sampler_type = nbl::hlsl::sampling::SphericalTriangle; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -218,7 +221,7 @@ struct SphericalTriangleStressConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index 29994511f..4f2ae08a4 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -45,7 +45,8 @@ class CUniformHemisphereTester final : public ITester& jacobianStats() +{ + static nbl::core::map s; + return s; +} +} // namespace detail + +inline void logJacobianSkipCounts(nbl::system::ILogger* logger) +{ + auto& stats = detail::jacobianStats(); + if (stats.empty()) + return; + logger->log("Jacobian skip summary (skipped samples are NOT counted as passes):", nbl::system::ILogger::ELL_INFO); + for (const auto& [name, s] : stats) + { + const uint64_t skipped = s.skipUDomain + s.skipCrease + s.skipHemiBoundary + s.skipBwdPdfRange + s.skipCodomainSingularity; + if (skipped == 0) + continue; + const double percentage = s.total ? 
(100.0 * double(skipped) / double(s.total)) : 0.0; + logger->log(" [JacobianSkip] %s: %llu / %llu skipped (%.2f%%) -- u-domain=%llu, crease=%llu, hemi-boundary=%llu, bwd-pdf-range=%llu, codomain-singularity=%llu", + nbl::system::ILogger::ELL_WARNING, + name.c_str(), + skipped, + s.total, + percentage, + s.skipUDomain, + s.skipCrease, + s.skipHemiBoundary, + s.skipBwdPdfRange, + s.skipCodomainSingularity); + } +} + +// Verify a jacobianProduct value OR bin it by reason if it is a skip sentinel (< 0). +// Skipped samples are counted by reason and NEVER counted as a pass. +// Must be called from a method that has access to verifyTestValue. +#define VERIFY_JACOBIAN_OR_SKIP(pass, name, expected, actual, iteration, seed, testType, relTol, absTol) \ + do \ + { \ + auto& _jstats = detail::jacobianStats()[(name)]; \ + ++_jstats.total; \ + const float _jval = (actual); \ + if (_jval < 0.0f) \ + { \ + /* Sentinel values are integers at -1..-5, so round-to-nearest on _jval picks the bin. */ \ + const int _bin = static_cast(-_jval + 0.5f); \ + switch (_bin) \ + { \ + case 1: \ + ++_jstats.skipUDomain; \ + break; \ + case 2: \ + ++_jstats.skipCrease; \ + break; \ + case 3: \ + ++_jstats.skipHemiBoundary; \ + break; \ + case 4: \ + ++_jstats.skipBwdPdfRange; \ + break; \ + case 5: \ + ++_jstats.skipCodomainSingularity; \ + break; \ + default: \ + ++_jstats.skipUDomain; \ + break; /* fall-through bucket */ \ + } \ + } \ + else \ + { \ + pass &= verifyTestValue((name), (expected), _jval, (iteration), (seed), (testType), (relTol), (absTol)); \ + } \ + } while (0) + // Check that each PDF field is positive and finite. // Must be called from within a method that has access to printTestFail. -#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) \ - do \ - { \ - auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ - std::apply([&](const auto&... c) { (([&] { \ +#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) 
\ + do \ + { \ + auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ + std::apply([&](const auto&... c) { (([&] { \ if (!((actual).*c.field > 0.0f) || !std::isfinite((actual).*c.field)) \ - { \ - pass = false; \ - printTestFail(std::string(c.name) + " (positive & finite)", \ - 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ - } \ - }()), \ - ...); }, _pdfChecks); \ + { \ + pass = false; \ + printTestFail(std::string(c.name) + " (positive & finite)", \ + 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ + } \ + }()), \ + ...); }, _pdfChecks); \ } while (0) // ============================================================================ @@ -139,7 +235,7 @@ inline float64_t gridIntegratePdf1D(const auto& sampler, uint32_t N = 100000) // 2D grid integration of backwardPdf over [0,1]^2 inline float64_t gridIntegratePdf2D(const auto& sampler, uint32_t N = 1000) { - float64_t sum = 0.0; + float64_t sum = 0.0; const float64_t cellArea = 1.0 / static_cast(N * N); for (uint32_t iy = 0; iy < N; iy++) { @@ -190,17 +286,15 @@ inline void buildTangentFrame(nbl::hlsl::float32_t3 dir, nbl::hlsl::float32_t3& // Generate a small equilateral triangle on the unit sphere around baseDir with given half-angle. // Also generates a random normal with decent projection onto the triangle. 
-inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, - nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, - nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) +inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; baseDir = generateRandomUnitVector(rng); float32_t3 t1, t2; buildTangentFrame(baseDir, t1, t2); - v0 = normalize(baseDir + t1 * halfAngle); - v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); - v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); + v0 = normalize(baseDir + t1 * halfAngle); + v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); + v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); normal = generateRandomUnitVector(rng); if (dot(normal, baseDir) < 0.1f) normal = normalize(normal + baseDir * 2.0f); @@ -221,10 +315,10 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float spread = 0.15f + angleDist(rng) * 0.2f; - v0 = normalize(base + t1 * spread); - v1 = normalize(base - t1 * spread); - float far_ = 0.8f + angleDist(rng) * 0.8f; - v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); + v0 = normalize(base + t1 * spread); + v1 = normalize(base - t1 * spread); + float far_ = 0.8f + angleDist(rng) * 0.8f; + v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); break; } case 1: // Nearly coplanar @@ -233,12 +327,12 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(pole, t1, t2); float offset = 0.05f + angleDist(rng) * 0.1f; - float a1 = angleDist(rng) * 6.2832f; - float a2 = a1 + 0.8f + angleDist(rng); - float a3 = a2 + 0.8f + 
angleDist(rng); - v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); - v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); - v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); + float a1 = angleDist(rng) * 6.2832f; + float a2 = a1 + 0.8f + angleDist(rng); + float a3 = a2 + 0.8f + angleDist(rng); + v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); + v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); + v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); break; } default: // One short edge @@ -247,9 +341,9 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float shortAngle = 0.32f + angleDist(rng) * 0.1f; - v0 = normalize(base + t1 * shortAngle * 0.5f); - v1 = normalize(base - t1 * shortAngle * 0.5f); - v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); + v0 = normalize(base + t1 * shortAngle * 0.5f); + v1 = normalize(base - t1 * shortAngle * 0.5f); + v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); break; } } @@ -262,65 +356,114 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 inline void makeEquilateralTriangle(float64_t theta, nbl::hlsl::float32_t3 verts[3]) { using namespace nbl::hlsl; - const float32_t st = static_cast(std::sin(theta)); - const float32_t ct = static_cast(std::cos(theta)); + const float32_t st = static_cast(std::sin(theta)); + const float32_t ct = static_cast(std::cos(theta)); constexpr float64_t twoPiOver3 = 2.0 * numbers::pi / 3.0; - verts[0] = float32_t3(st, 0.0f, ct); - verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), + verts[0] = float32_t3(st, 0.0f, ct); + verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), static_cast(st * std::sin(twoPiOver3)), ct); - verts[2] = float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), + verts[2] = 
float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), static_cast(st * std::sin(2.0 * twoPiOver3)), ct); } -// Monte Carlo estimate of projected solid angle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula (which uses abs so that -// triangles straddling the horizon contribute positively from both hemispheres). -// Samples L uniformly from the spherical triangle. -inline float64_t mcEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N, std::mt19937& rng) +// Grid estimate of projected solid angle: mean of abs(dot(L, normal)) over a regular +// [0,1]^2 grid, times solidAngle. Uses abs() to match the BSDF projected solid angle +// formula (triangles/rects straddling the horizon contribute from both hemispheres). +// `N` is the total number of samples; the grid side is ceil(sqrt(N)). Grid integration +// is deterministic and has much lower variance than MC at the same sample count, +// so it's a tighter ground truth for PSA-vs-formula comparisons. 
+inline float64_t gridEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N) { using namespace nbl::hlsl; - auto sampler = sampling::SphericalTriangle::create(shape); - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + auto sampler = sampling::SphericalTriangle::create(shape); + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalTriangle::cache_type cache; - float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalTriangle::cache_type cache; + const float32_t3 L = sampler.generate(float32_t2(ux, uy), cache); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(shape.solid_angle); + return sum / static_cast(gridSide * gridSide) * static_cast(shape.solid_angle); } -// Monte Carlo estimate of projected solid angle for a rectangle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula. -// Samples uniformly from the spherical rectangle, reconstructs world-space direction. -inline float64_t mcEstimatePSA( +// Sampler-independent PSA reference for rectangles. Integrates the projected-solid-angle integral +// PSA = integral over rect surface of |cos(theta_receiver)| * |cos(theta_rect)| / d^2 dA +// on a uniform surface grid in (s, t) in [0, extents.x] x [0, extents.y]. No sampler involved, +// so disagreement with a sampler-derived PSA isolates the sampler / formula. 
+inline float64_t surfaceGridEstimatePSA( const nbl::hlsl::shapes::SphericalRectangle& shape, const nbl::hlsl::float32_t3& observer, const nbl::hlsl::float32_t3& normal, - uint32_t N, std::mt19937& rng) + uint32_t N) +{ + using namespace nbl::hlsl; + const float32_t3 rdir = shape.basis[0]; + const float32_t3 udir = shape.basis[1]; + const float32_t3 rectNormal = shape.basis[2]; + const float32_t width = shape.extents.x; + const float32_t height = shape.extents.y; + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float64_t cellArea = static_cast(width) * static_cast(height) / static_cast(gridSide * gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) + { + const float32_t t = (static_cast(iy) + 0.5f) * height / static_cast(gridSide); + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float32_t s = (static_cast(ix) + 0.5f) * width / static_cast(gridSide); + const float32_t3 worldPt = shape.origin + rdir * s + udir * t; + const float32_t3 toSurf = worldPt - observer; + const float64_t d2 = static_cast(dot(toSurf, toSurf)); + const float64_t d = std::sqrt(d2); + const float32_t3 L = toSurf * static_cast(1.0 / d); + const float64_t cosRx = static_cast(hlsl::abs(dot(normal, L))); + const float64_t cosRt = static_cast(hlsl::abs(dot(rectNormal, L))); + sum += cosRx * cosRt / d2; + } + } + return sum * cellArea; +} + +// Grid estimate of projected solid angle for a rectangle: mean of abs(dot(L, normal)) +// over a regular [0,1]^2 grid, times solidAngle. See the triangle overload above. 
+inline float64_t gridEstimatePSA( + const nbl::hlsl::shapes::SphericalRectangle& shape, + const nbl::hlsl::float32_t3& observer, + const nbl::hlsl::float32_t3& normal, + uint32_t N) { using namespace nbl::hlsl; auto sampler = sampling::SphericalRectangle::create(shape, observer); if (sampler.solidAngle <= 0.0f || !std::isfinite(sampler.solidAngle)) return 0.0; - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalRectangle::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); - // Reconstruct world-space direction from rectangle offset - float32_t3 worldPt = shape.origin - + shape.basis[0] * gen.x - + shape.basis[1] * gen.y; - float32_t3 L = normalize(worldPt - observer); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalRectangle::cache_type cache; + // `generateLocalBasisXY` returns absolute (xu, yv) on the rectangle surface; subtract r0.xy + // to get the offset-from-r0 that the world-space reconstruction below expects. 
+ const float32_t2 absXY = sampler.generateLocalBasisXY(float32_t2(ux, uy), cache); + const float32_t2 gen = absXY - float32_t2(sampler.r0.x, sampler.r0.y); + const float32_t3 worldPt = shape.origin + shape.basis[0] * gen.x + shape.basis[1] * gen.y; + const float32_t3 L = normalize(worldPt - observer); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(sampler.solidAngle); + return sum / static_cast(gridSide * gridSide) * static_cast(sampler.solidAngle); } // Bundles seed + rng + failCount for randomized property tests. @@ -357,14 +500,18 @@ struct SeededTestContext } }; -// Generic PSA vs MC comparison. -// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& mcPSA, InfoLogger& info) -// Must set formulaPSA and mcPSA for config `index`, or set both to 0 to skip. +// Generic PSA vs grid-integration comparison. +// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& gridPSA, InfoLogger& info) +// Must set formulaPSA and gridPSA for config `index`, or set both to 0 to skip. // `info` is a callable: void(nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) that logs // sampler/shape details for the current config. Called on mismatch. -// When diagnostic=true, failures log at ELL_WARNING instead of ELL_ERROR (non-hard-fail). +// Two-tier tolerance: +// - (relTol, absTol): soft threshold. Exceedance counts as a mismatch. With diagnostic=true +// the run still returns true (known-limitation noise); with diagnostic=false it hard-fails. +// - (hardRelTol, hardAbsTol): egregious threshold. Always hard-fails regardless of diagnostic, +// so a catastrophic regression can't hide inside the warning stream. 
template -inline bool testPSAVersusMonteCarlo( +inline bool testPSAVersusGrid( nbl::system::ILogger* logger, const char* tag, const char* label, @@ -372,49 +519,78 @@ inline bool testPSAVersusMonteCarlo( uint32_t numConfigs, float64_t relTol, float64_t absTol, + float64_t hardRelTol, + float64_t hardAbsTol, bool diagnostic = false) { - const auto failLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; + const auto softFailLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; SeededTestContext ctx; + uint32_t hardFailCount = 0; + uint32_t testedCount = 0; for (uint32_t c = 0; c < numConfigs; c++) { - float64_t formulaPSA = 0.0, mcPSA = 0.0; + float64_t formulaPSA = 0.0, gridPSA = 0.0; std::function logInfo = - [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) {}; - configGenerator(ctx.rng, c, formulaPSA, mcPSA, logInfo); + [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) { + }; + configGenerator(ctx.rng, c, formulaPSA, gridPSA, logInfo); - if (mcPSA == 0.0 && formulaPSA == 0.0) + if (gridPSA == 0.0 && formulaPSA == 0.0) continue; + testedCount++; + + const float64_t absErr = std::abs(formulaPSA - gridPSA); + const float64_t relErr = (std::abs(gridPSA) > 1e-10) ? absErr / std::abs(gridPSA) : 0.0; - const float64_t absErr = std::abs(formulaPSA - mcPSA); - const float64_t relErr = (std::abs(mcPSA) > 1e-10) ? absErr / std::abs(mcPSA) : 0.0; + const bool softFail = relErr > relTol && absErr > absTol; + const bool hardFail = relErr > hardRelTol && absErr > hardAbsTol; - if (relErr > relTol && absErr > absTol) + if (softFail) { ctx.failCount++; + if (hardFail) + hardFailCount++; if (ctx.failCount <= 5) { - logger->log(" [%s] %s mismatch: formula=%f expected(MC)=%f relErr=%e absErr=%e config %u", - failLevel, tag, label, formulaPSA, mcPSA, relErr, absErr, c); - logInfo(logger, failLevel); + const auto level = hardFail ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + logger->log(" [%s] %s %s: formula=%f expected(grid)=%f relErr=%e absErr=%e config %u", + level, tag, label, hardFail ? "HARD mismatch" : "mismatch", + formulaPSA, gridPSA, relErr, absErr, c); + logInfo(logger, level); } } } + const uint32_t skippedCount = numConfigs - testedCount; + if (ctx.failCount == 0) - logger->log(" [%s] %s PASSED (%u configs, relTol=%e absTol=%e)", - nbl::system::ILogger::ELL_PERFORMANCE, tag, label, numConfigs, relTol, absTol); - else { - logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, relTol=%e absTol=%e)", - failLevel, tag, label, ctx.failCount, numConfigs, relTol, absTol); - if (diagnostic) - logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", - nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + logger->log(" [%s] %s PASSED (%u tested, %u skipped of %u requested, relTol=%e absTol=%e)", + nbl::system::ILogger::ELL_PERFORMANCE, tag, label, + testedCount, skippedCount, numConfigs, relTol, absTol); + return true; } - return diagnostic ? true : ctx.finalize(logger, tag); + const bool hardFailed = hardFailCount > 0; + const auto summaryLevel = hardFailed ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + if (hardFailed) + logger->log(" [%s] %s FAILED (%u/%u exceeded soft tol, %u/%u exceeded HARD tol, %u skipped of %u, hardRelTol=%e hardAbsTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, hardFailCount, testedCount, + skippedCount, numConfigs, hardRelTol, hardAbsTol); + else + logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, %u skipped of %u, relTol=%e absTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, skippedCount, numConfigs, relTol, absTol); + + const bool shouldHardFail = hardFailed || !diagnostic; + if (shouldHardFail) + logger->log(" [%s] reproduce with seed=%u", + nbl::system::ILogger::ELL_ERROR, tag, ctx.seed); + else + logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", + nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + + return !shouldHardFail; } // ============================================================================ @@ -435,23 +611,21 @@ inline void generateRandomRectangle(std::mt19937& rng, float32_t3 t1, t2; buildTangentFrame(normal, t1, t2); - const float width = sizeDist(rng); + const float width = sizeDist(rng); const float height = sizeDist(rng); - const float dist = distDist(rng); + const float dist = distDist(rng); - observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); + observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); compressed.origin = observer - normal * dist + t1 * offsetDist(rng) + t2 * offsetDist(rng); - compressed.right = t1 * width; - compressed.up = t2 * height; + compressed.right = t1 * width; + compressed.up = t2 * height; } // Stress rectangles: ill-conditioned geometries that exercise edge cases. 
// - Extreme aspect ratio (10:1 to 20:1) // - Grazing angle (observer nearly in the rectangle plane) // - Observer near corner (most of the rectangle off to one side) -inline void generateStressRectangle(std::mt19937& rng, - nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, - nbl::hlsl::float32_t3& observer) +inline void generateStressRectangle(std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer) { using namespace nbl::hlsl; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -464,39 +638,39 @@ inline void generateStressRectangle(std::mt19937& rng, switch (caseDist(rng)) { case 0: // Extreme aspect ratio - { - const float longSide = 3.0f + uDist(rng) * 5.0f; - const float shortSide = 0.1f + uDist(rng) * 0.2f; - const float dist = 1.5f + uDist(rng) * 2.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); - compressed.right = t1 * longSide; - compressed.up = t2 * shortSide; - break; - } + { + const float longSide = 3.0f + uDist(rng) * 5.0f; + const float shortSide = 0.1f + uDist(rng) * 0.2f; + const float dist = 1.5f + uDist(rng) * 2.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); + compressed.right = t1 * longSide; + compressed.up = t2 * shortSide; + break; + } case 1: // Grazing angle (observer nearly in the rectangle plane) - { - const float width = 1.0f + uDist(rng) * 2.0f; - const float height = 1.0f + uDist(rng) * 2.0f; - const float normalDist = 0.05f + uDist(rng) * 0.15f; - const float tangentOffset = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 1.0f + uDist(rng) * 2.0f; + const float height = 1.0f + 
uDist(rng) * 2.0f; + const float normalDist = 0.05f + uDist(rng) * 0.15f; + const float tangentOffset = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } default: // Observer near corner - { - const float width = 2.0f + uDist(rng) * 3.0f; - const float height = 2.0f + uDist(rng) * 3.0f; - const float dist = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 2.0f + uDist(rng) * 3.0f; + const float height = 2.0f + uDist(rng) * 3.0f; + const float dist = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } } } @@ -590,10 +764,10 @@ inline void logRectInfo( { using namespace nbl::system; using namespace nbl::hlsl; - const float width = length(compressed.right); - const float height = length(compressed.up); + const float width = length(compressed.right); + const float height = length(compressed.up); const float32_t3 normal = normalize(cross(compressed.right, compressed.up)); - const float dist = length(compressed.origin - observer); + const float dist = length(compressed.origin - observer); logger->log(" origin=%s right=%s up=%s observer=%s", ILogger::ELL_ERROR, to_string(compressed.origin).c_str(), @@ -617,14 +791,14 @@ inline bool anyRectCornerAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const 
float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f || dot(localN, v1) > 0.0f || - dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; } // True if all rectangle corners have positive NdotL with the given normal. @@ -635,14 +809,14 @@ inline bool allRectCornersAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f && dot(localN, v1) > 0.0f && - dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; } #endif diff --git 
a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h index cb28b63fc..ecb0f606d 100644 --- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h +++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h @@ -414,6 +414,12 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Float32 solid angle (acos sum - pi) loses precision for small + // triangles due to catastrophic cancellation, making the expected + // sub-solid-angle ratio unreliable as a reference value. + // At SA ~ 0.003, the relative error in float32 solid angles reaches + // ~1-3%, comparable to the half-space counting tolerance. + const bool tinyTriangle = SA < 4e-3; // For each cut: pick a vertex and a point on the opposite edge, // forming a great circle that splits the triangle in two. @@ -482,12 +488,20 @@ class CSphericalTriangleGenerateTester testedCuts++; if (absErr > relTol) { - ctx.failCount++; - if (ctx.failCount <= 5) + if (tinyTriangle) { - m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", - system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); - logTriangleInfo(m_logger, v0, v1, v2); + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u -- solid angle %e too small for float32, especially on GPU", + system::ILogger::ELL_WARNING, label, observedFraction, expectedFraction, absErr, relTol, t, c, SA); + } + else + { + ctx.failCount++; + if (ctx.failCount <= 5) + { + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", + system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); + logTriangleInfo(m_logger, v0, v1, v2); + } } } } @@ 
-504,12 +518,20 @@ class CSphericalTriangleGenerateTester } // ------------------------------------------------------------------------- - // Moment matching: E[dot(generate(u), N)] should equal PSA(N) / SA. + // Moment matching: E[dot(generate(u), N)] should equal signedPSA(N) / SA. // // For a uniform distribution over a spherical triangle: // E[f(L)] = (1/SA) * integral_triangle f(L) dw // - // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = PSA(N) / SA. + // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = signedPSA(N) / SA, + // where signedPSA is the exact signed projected solid angle computed + // via the Kelvin-Stokes theorem: + // signedPSA(N) = 0.5 * sum_edges dot(edgeNormal_i, N) * edgeArcLength_i + // + // Note: shapes::SphericalTriangle::projectedSolidAngle() returns a signed result + // (Kelvin-Stokes signed sum); tests abs() the return to compare against the + // |cos(theta)| (BSDF) PSA integral reference. + // // If generate() has a systematic bias (e.g., concentrating samples // near one vertex), this moment will be wrong for most directions N. // Testing multiple random N per triangle makes it very unlikely that @@ -533,11 +555,34 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Precompute edge normals and arc lengths for the signed PSA formula. + // cross(v_j, v_k) * csc_sides[i] gives outward-pointing edge normals + // only when the vertices are CCW as seen from outside the sphere. + // The sign of the triple product dot(v0, cross(v1, v2)) tells us the + // winding: positive = CCW (outward normals), negative = CW (inward). + const float32_t3 crossBC = hlsl::cross(shape.vertices[1], shape.vertices[2]); + const float64_t windingSign = (hlsl::dot(shape.vertices[0], crossBC) >= 0.0f) ? 
1.0 : -1.0; + const float32_t3 edgeNormals[3] = { + crossBC * shape.csc_sides[0], + hlsl::cross(shape.vertices[2], shape.vertices[0]) * shape.csc_sides[1], + hlsl::cross(shape.vertices[0], shape.vertices[1]) * shape.csc_sides[2] + }; + const float64_t edgeAngles[3] = { + std::acos(static_cast(hlsl::clamp(shape.cos_sides[0], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[1], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[2], -1.0f, 1.0f))) + }; + for (uint32_t n = 0; n < numNormals; n++) { float32_t3 N = generateRandomUnitVector(ctx.rng); - const float64_t psa = static_cast(shape.projectedSolidAngle(N)); - const float64_t expected = psa / SA; + + // Signed PSA via Kelvin-Stokes: exact for integral dot(L,N) dOmega + float64_t signedPSA = 0.0; + for (uint32_t e = 0; e < 3; e++) + signedPSA += static_cast(hlsl::dot(edgeNormals[e], N)) * edgeAngles[e]; + signedPSA *= 0.5 * windingSign; + const float64_t expected = signedPSA / SA; float64_t sum = 0.0; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -546,7 +591,7 @@ class CSphericalTriangleGenerateTester float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); typename sampling::SphericalTriangle::cache_type cache; float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(L, N))); + sum += static_cast(dot(L, N)); } const float64_t mcEstimate = sum / static_cast(numSamples); @@ -601,7 +646,7 @@ class CSphericalTriangleGenerateTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) continue; - auto sampler = sampling::SphericalTriangle::create(shape); + auto sampler = sampling::SphericalTriangle::create(shape); std::uniform_real_distribution uDist(0.0f, 1.0f); for (uint32_t i = 0; i < samplesPerTriangle; i++) @@ -742,7 +787,7 @@ class CSphericalTriangleGenerateTester // Tests two aspects of projected spherical triangles: // // 1. 
PSA formula accuracy: shapes::SphericalTriangle::projectedSolidAngle -// against Monte Carlo ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). +// against grid-integration ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). // // 2. PST sampler accuracy: how well ProjectedSphericalTriangle's bilinear // importance sampling approximates the true NdotL distribution, and @@ -767,18 +812,21 @@ class CProjectedSphericalTriangleGeometricTester // when edge normals have mixed signs, even when all vertices are above the horizon. // These tests are diagnostic-only until proper hemisphere clipping is implemented. // TODO: make these hard failures once projectedSolidAngle clips to the hemisphere. - testPSAVersusMonteCarlo("random MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + // Hard-fail thresholds: relErr > 3.0 AND absErr > 0.3 means the formula is catastrophically + // wrong, not just affected by the known abs()-overcount limitation. Catches regressions that + // would otherwise hide in the warning stream. 
+ pass &= testPSAVersusGrid("random", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); - normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, true); - testPSAVersusMonteCarlo("grazing MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, 3.0, 0.3, true); + pass &= testPSAVersusGrid("grazing", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); float32_t3 triCenter = normalize(v0 + v1 + v2); float32_t3 tangent, unused; buildTangentFrame(triCenter, tangent, unused); std::uniform_real_distribution grazeDist(0.02f, 0.15f); - normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, true); + normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, 3.0, 0.3, true); // Also diagnostic -- same abs() issue affects small triangles testPSASmallTriangle(); @@ -860,7 +908,7 @@ class CProjectedSphericalTriangleGeometricTester // Known analytic cases bool testPSAKnownCases() { - constexpr float64_t psaOctantMCRelTol = 0.05; + constexpr float64_t psaOctantGridRelTol = 0.05; constexpr float64_t psaSymmetryRelTol = 1e-4; SeededTestContext ctx; @@ -872,51 +920,52 @@ class CProjectedSphericalTriangleGeometricTester // By Kelvin-Stokes / direct integration, PSA = pi/4 for any axis-aligned normal. 
{ auto shape = createSphericalTriangleShape(float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)); - const float64_t psaZ = static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1))); + const float64_t psaZ = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1)))); - // MC verification: sample many points uniformly from the octant triangle - const float64_t mcPSA = mcEstimatePSA(shape, float32_t3(0, 0, 1), 1000000, ctx.rng); + // Grid verification: evaluate abs(N.L) over a dense grid on the octant triangle + const float64_t gridPSA = gridEstimatePSA(shape, float32_t3(0, 0, 1), 1000000); - const float64_t formulaVsMC = std::abs(psaZ - mcPSA) / std::abs(mcPSA); - m_logger->log(" [PSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", - system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, mcPSA, formulaVsMC); + const float64_t formulaVsGrid = std::abs(psaZ - gridPSA) / std::abs(gridPSA); + m_logger->log(" [TriPSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", + system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, gridPSA, formulaVsGrid); - if (formulaVsMC > psaOctantMCRelTol) + if (formulaVsGrid > psaOctantGridRelTol) { - m_logger->log(" [PSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", - system::ILogger::ELL_ERROR, psaZ, mcPSA, formulaVsMC, psaOctantMCRelTol); + m_logger->log(" [TriPSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", + system::ILogger::ELL_ERROR, psaZ, gridPSA, formulaVsGrid, psaOctantGridRelTol); pass = false; } // Same octant, normal = (1,0,0): by symmetry same result as z-normal - const float64_t psaX = static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0))); + const float64_t psaX = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0)))); const float64_t relDiff = std::abs(psaZ - psaX) / std::max(psaZ, psaX); - m_logger->log(" [PSA] octant symmetry: psaZ=%f 
psaX=%f relDiff=%e", + m_logger->log(" [TriPSA] octant symmetry: psaZ=%f psaX=%f relDiff=%e", system::ILogger::ELL_PERFORMANCE, psaZ, psaX, relDiff); if (relDiff > psaSymmetryRelTol) { - m_logger->log(" [PSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", + m_logger->log(" [TriPSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", system::ILogger::ELL_ERROR, psaZ, psaX, relDiff, psaSymmetryRelTol); pass = false; } } if (pass) - m_logger->log(" [PSA] known cases PASSED (octant z-normal vs MC relTol=%e, octant symmetry z vs x relTol=%e)", - system::ILogger::ELL_PERFORMANCE, psaOctantMCRelTol, psaSymmetryRelTol); + m_logger->log(" [TriPSA] known cases PASSED (octant z-normal vs grid relTol=%e, octant symmetry z vs x relTol=%e)", + system::ILogger::ELL_PERFORMANCE, psaOctantGridRelTol, psaSymmetryRelTol); - return ctx.finalize(pass, m_logger, "PSA"); + return ctx.finalize(pass, m_logger, "TriPSA"); } - // Helper: run MC comparison of formulaPSA vs E[dot(L,N)]*SA for a set of triangle configs. + // Helper: run grid-integration comparison of formulaPSA vs PSA reference for a set of triangle configs. // TriConfigGen: void(rng, index, v0, v1, v2, normal) — generates triangle vertices + normal. 
template - bool testPSAVersusMonteCarlo(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol, bool diagnostic = false) + bool testPSAVersusGrid(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol, bool diagnostic = false) { - return ::testPSAVersusMonteCarlo(m_logger, "PSA", label, - [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "TriPSA", label, + [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { float32_t3 v0, v1, v2, normal; triConfigGenerator(rng, c, v0, v1, v2, normal); @@ -925,8 +974,8 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) return; - formulaPSA = static_cast(shape.projectedSolidAngle(normal)); - mcPSA = mcEstimatePSA(shape, normal, mcSamples, rng); + formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); + gridPSA = gridEstimatePSA(shape, normal, gridSamples); logInfo = [=](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -935,14 +984,14 @@ class CProjectedSphericalTriangleGeometricTester to_string(normal).c_str(), to_string(shape.solid_angle).c_str()); }; }, - numConfigs, relTol, absTol, diagnostic); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, diagnostic); } - // Small triangles -- PSA should approach MC ground truth + // Small triangles -- PSA should approach grid ground truth bool testPSASmallTriangle() { constexpr float64_t smallTriMeanRelErrTol = 0.1; - constexpr uint32_t smallTriMCSamples = 100000; + constexpr uint32_t smallTriGridSamples = 100000; SeededTestContext ctx; bool pass = true; @@ -973,27 +1022,27 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle 
<= 0.0f || !std::isfinite(shape.solid_angle)) continue; - const float64_t formulaPSA = static_cast(shape.projectedSolidAngle(normal)); + const float64_t formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); const float64_t sa = static_cast(shape.solid_angle); const float64_t centerNdotL = static_cast(dot(normal, baseDir)); if (std::abs(centerNdotL) < 0.1 || sa < 1e-10) continue; - // MC ground truth: E[abs(dot(L, N))] * solidAngle - const float64_t mcPSA = mcEstimatePSA(shape, normal, smallTriMCSamples, ctx.rng); + // Grid ground truth: mean over regular [0,1]^2 grid of abs(dot(L, N)) * solidAngle + const float64_t gridPSA = gridEstimatePSA(shape, normal, smallTriGridSamples); - if (std::abs(mcPSA) < 1e-10) + if (std::abs(gridPSA) < 1e-10) continue; - const float64_t relErr = (formulaPSA - mcPSA) / mcPSA; + const float64_t relErr = (formulaPSA - gridPSA) / gridPSA; sumRelErrPerSize[s] += relErr; validTrials[s]++; } } - m_logger->log(" [PSA] small triangle PSA vs MC (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); + m_logger->log(" [TriPSA] small triangle PSA vs grid (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); for (uint32_t s = 0; s < numSizes; s++) { if (validTrials[s] > 0) @@ -1005,14 +1054,14 @@ class CProjectedSphericalTriangleGeometricTester // Skip halfAngle=0.01 (s==5): float32 solid angle precision collapses if (s == 4 && std::abs(meanRelErr) > smallTriMeanRelErrTol) { - m_logger->log(" [PSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", + m_logger->log(" [TriPSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", system::ILogger::ELL_WARNING, halfAngles[s], meanRelErr, smallTriMeanRelErrTol, validTrials[s]); } } } - m_logger->log(" [PSA] small triangle test complete (%u trials across %u sizes, %u MC samples each, meanRelErrTol=%e) -- diagnostic only", - 
system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriMCSamples, smallTriMeanRelErrTol); + m_logger->log(" [TriPSA] small triangle test complete (%u trials across %u sizes, %u grid samples each, meanRelErrTol=%e) -- diagnostic only", + system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriGridSamples, smallTriMeanRelErrTol); return true; // diagnostic only -- abs()-based PSA overestimates, not a hard failure } @@ -1076,7 +1125,7 @@ class CProjectedSphericalTriangleGeometricTester if (!std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) continue; - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); const bool hasPSA = projSA > 0.0 && std::isfinite(projSA); const float64_t rcpPSA = hasPSA ? 1.0 / projSA : 0.0; MISStats& mis = isGrazing ? grazingMIS : normalMIS; @@ -1090,7 +1139,7 @@ class CProjectedSphericalTriangleGeometricTester float32_t3 L = sampler.generate(u, cache); const float64_t trueNdotL = std::max(0.0, static_cast(dot(cfg.normal, L))); - const float64_t bilinearNdotL = static_cast(cache.abs_cos_theta); + const float64_t bilinearNdotL = std::numeric_limits::quiet_NaN(); const float64_t pstPdf = static_cast(sampler.forwardPdf(u, cache)); // Bilinear vs true NdotL @@ -1323,7 +1372,7 @@ class CProjectedSphericalTriangleGeometricTester continue; auto sampler = createSampler(cfg); - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); if (projSA <= 0.0 || !std::isfinite(projSA) || !std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) @@ -1344,7 +1393,11 @@ class CProjectedSphericalTriangleGeometricTester if (trueNdotL < 1e-6) continue; - const float64_t pstPdf = static_cast(sampler.backwardPdf(L)); + // No direct backwardPdf; evaluate forwardPdf at the 
inverted u to recover pdf(L). + const float32_t2 uInv = sampler.sphtri.generateInverse(L); + typename sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(uInv, pdfCache); + const float64_t pstPdf = static_cast(sampler.forwardPdf(uInv, pdfCache)); const float64_t idealPdf = trueNdotL * rcpPSA; if (!std::isfinite(pstPdf) || pstPdf <= 0.0 || idealPdf <= 0.0) @@ -1416,6 +1469,15 @@ struct UniformRectSamplerPolicy return sampler_type::create(shape, observer); } + // Returns offset-from-r0 on the rectangle surface. Goes through generateLocalBasisXY + // (absolute xy) and subtracts r0.xy so the [0, extents] bounds check still applies. + static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampler_type::cache_type cache; + const float32_t2 absXY = s.generateLocalBasisXY(u, cache); + return absXY - float32_t2(s.r0.x, s.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.solidAngle; } static const char* name() { return "SphericalRectangle"; } @@ -1425,7 +1487,8 @@ struct UniformRectSamplerPolicy struct ProjectedRectSamplerPolicy { - using sampler_type = sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for diagnostic logs. + using sampler_type = sampling::ProjectedSphericalRectangle; static sampler_type createSampler(shapes::SphericalRectangle& shape, const float32_t3& observer, std::mt19937& rng) @@ -1439,6 +1502,17 @@ struct ProjectedRectSamplerPolicy return sampler_type::create(shape, observer, receiverNormal, false); } + // Run u through the bilinear warp then the inner sphrect's generateLocalBasisXY, and subtract + // r0.xy to get offset-from-r0 on the rectangle surface. 
+ static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = s.bilinearPatch.generate(u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + const float32_t2 absXY = s.sphrect.generateLocalBasisXY(warped, sphrectCache); + return absXY - float32_t2(s.sphrect.r0.x, s.sphrect.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.sphrect.solidAngle; } static const char* name() { return "ProjectedSphericalRectangle"; } @@ -1635,8 +1709,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); const float coord = cutAlongX ? gen.x : gen.y; if (coord < cutThreshold) countInSub++; @@ -1714,8 +1787,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); float32_t3 dir = reconstructDirection(compressed, shape.extents, observer, gen); sum += static_cast(dot(dir, N)); } @@ -1778,8 +1850,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); if (gen.x < -1e-5f || gen.x > extX + 1e-5f || gen.y < -1e-5f || gen.y > extY + 1e-5f) { @@ -1891,9 +1962,9 @@ using CProjectedSphericalRectangleGenerateTester = CRectangleGenerateTester 3.0 AND absErr > 0.3) still catch catastrophic regressions. 
+ bool pass = true; + pass &= testPSAVersusGrid("random", generateRandomRectangle, 200, 500000, 0.05, 0.01, 3.0, 0.3); + pass &= testPSAVersusGrid("grazing", generateStressRectangle, 200, 500000, 0.1, 0.01, 3.0, 0.3); + return pass; } private: // Reuse rectangle generators from CRectangleGenerateTester using RectGen = void(*)(std::mt19937&, shapes::CompressedSphericalRectangle&, float32_t3&); - bool testPSAVersusMonteCarlo(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol) + bool testPSAVersusGrid(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol) { - return ::testPSAVersusMonteCarlo(m_logger, "RectPSA", label, - [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "RectPSA", label, + [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { shapes::CompressedSphericalRectangle compressed; float32_t3 observer; @@ -1932,7 +2006,9 @@ class CProjectedSphericalRectangleGeometricTester float32_t3 normal = generateRandomUnitVector(rng); formulaPSA = static_cast(shape.projectedSolidAngle(observer, normal)); - mcPSA = mcEstimatePSA(shape, observer, normal, mcSamples, rng); + // surfaceGridEstimatePSA integrates over the rectangle surface directly (no sampler in + // the loop), so a formula-vs-reference mismatch here isolates the PSA formula. 
+ gridPSA = surfaceGridEstimatePSA(shape, observer, normal, gridSamples); logInfo = [compressed, observer, normal, saValue = sa.value](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -1945,7 +2021,7 @@ class CProjectedSphericalRectangleGeometricTester to_string(saValue).c_str()); }; }, - numConfigs, relTol, absTol, true); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, true); } system::ILogger* m_logger; From a4559b941a9d0f465ccc8687630077e045829403 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Fri, 24 Apr 2026 21:22:28 +0300 Subject: [PATCH 4/5] alias table is packed, 2 versions, consolidated WORKGROUP_SIZE for tests and benchmarks, example 37 and 64 now use a single command buffer for benchmarks --- 37_HLSLSamplingTests/CMakeLists.txt | 31 +- .../app_resources/common/alias_table.hlsl | 102 ++- .../common/discrete_sampler_bench.hlsl | 14 +- .../common/spherical_triangle.hlsl | 2 +- .../shaders/alias_table_test.comp.hlsl | 74 --- .../shaders/bilinear_test.comp.hlsl | 4 - .../box_muller_transform_test.comp.hlsl | 4 - .../shaders/concentric_mapping_test.comp.hlsl | 4 - .../cumulative_probability_test.comp.hlsl | 15 +- .../shaders/linear_test.comp.hlsl | 4 - .../shaders/packed_alias_test.comp.hlsl | 114 ++++ .../shaders/polar_mapping_test.comp.hlsl | 4 - .../projected_hemisphere_test.comp.hlsl | 4 - .../shaders/projected_sphere_test.comp.hlsl | 4 - ...ojected_spherical_rectangle_test.comp.hlsl | 6 +- ...rojected_spherical_triangle_test.comp.hlsl | 4 - .../spherical_rectangle_test.comp.hlsl | 6 +- .../shaders/spherical_triangle.comp.hlsl | 4 +- .../shaders/test_compile.comp.hlsl | 10 - .../shaders/uniform_hemisphere_test.comp.hlsl | 4 - .../shaders/uniform_sphere_test.comp.hlsl | 4 - .../benchmarks/CDiscreteSamplerBenchmark.h | 386 +++++------ .../benchmarks/CSamplerBenchmark.h | 120 ++-- 37_HLSLSamplingTests/main.cpp | 189 +++--- .../tests/CAliasTableGPUTester.h | 52 +- 
37_HLSLSamplingTests/tests/CBilinearTester.h | 2 +- .../tests/CBoxMullerTransformTester.h | 2 +- .../tests/CConcentricMappingTester.h | 2 +- .../tests/CCumulativeProbabilityGPUTester.h | 2 +- .../tests/CDiscreteTableTester.h | 608 +++++++++++------- 37_HLSLSamplingTests/tests/CLinearTester.h | 2 +- .../tests/CPolarMappingTester.h | 2 +- .../tests/CProjectedHemisphereTester.h | 2 +- .../tests/CProjectedSphereTester.h | 2 +- .../CProjectedSphericalRectangleTester.h | 2 +- .../tests/CProjectedSphericalTriangleTester.h | 2 +- .../tests/CSphericalRectangleTester.h | 2 +- .../tests/CSphericalTriangleTester.h | 2 +- .../tests/CUniformHemisphereTester.h | 2 +- .../tests/CUniformSphereTester.h | 2 +- 64_EmulatedFloatTest/main.cpp | 123 +--- 41 files changed, 1031 insertions(+), 893 deletions(-) delete mode 100644 37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl create mode 100644 37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 12cbb5bb1..e50fe4663 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -26,7 +26,7 @@ set(DEPENDS app_resources/shaders/projected_spherical_triangle_test.comp.hlsl app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl app_resources/shaders/spherical_rectangle_test.comp.hlsl - app_resources/shaders/alias_table_test.comp.hlsl + app_resources/shaders/packed_alias_test.comp.hlsl app_resources/shaders/cumulative_probability_test.comp.hlsl app_resources/common/linear.hlsl app_resources/common/uniform_hemisphere.hlsl @@ -91,7 +91,7 @@ endif() set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") -set(BENCH_ITERS 2048) +set(BENCH_ITERS 128) set(WORKGROUP_SIZE 64) target_compile_definitions(${EXECUTABLE_NAME} PRIVATE @@ -99,7 +99,7 @@ target_compile_definitions(${EXECUTABLE_NAME} PRIVATE WORKGROUP_SIZE=${WORKGROUP_SIZE} ) -set(BENCH_OPTS 
"\"-DBENCH_ITERS=${BENCH_ITERS}\", \"-DWORKGROUP_SIZE=${WORKGROUP_SIZE}\"") +set(BENCH_OPTS "\"-DBENCH_ITERS=${BENCH_ITERS}\"") set(JSON " [ @@ -340,14 +340,24 @@ set(JSON " \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_test\" + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_test\" }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_bench\", + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_test\", + \"COMPILE_OPTIONS\": [\"-DNBL_PACKED_ALIAS_B\"] + }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_PACKED_ALIAS_B\"] + }, { \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_test\" @@ -361,6 +371,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_yolo_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_eytzinger_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_EYTZINGER\"] } ] ") @@ -370,7 +385,7 @@ NBL_CREATE_NSC_COMPILE_RULES( LINK_TO ${EXECUTABLE_NAME} BINARY_DIR ${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 -DWORKGROUP_SIZE=${WORKGROUP_SIZE} OUTPUT_VAR KEYS INCLUDE 
nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index bb1ed54ef..08706408f 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -8,12 +8,28 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t AliasTestTableSize = 4; +// Log2N = ceil_log2(N) minimises quantisation drift on the stayProb unorm +// (here 30 unorm bits, essentially lossless). +NBL_CONSTEXPR uint32_t AliasTestLog2N = 2; -using AliasTestProbAccessor = ArrayAccessor; -using AliasTestAliasAccessor = ArrayAccessor; -using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPackedWordAccessor = ArrayAccessor; -using AliasTestSampler = sampling::AliasTable; +// Dedicated struct-valued accessor for PackedAliasEntryB. Field-wise copy +// sidesteps HLSL's struct functional-cast ambiguity. 
+struct AliasTestEntryBAccessor +{ + using value_type = sampling::PackedAliasEntryB; + + template + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val.packedWord = data[i].packedWord; + val.ownPdf = data[i].ownPdf; + } + + value_type data[AliasTestTableSize]; +}; struct AliasTableInputValues { @@ -22,7 +38,7 @@ struct AliasTableInputValues struct AliasTableTestResults { - uint32_t generatedIndex; + uint32_t generatedIndex; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; @@ -31,24 +47,55 @@ struct AliasTableTestResults }; // Pre-computed alias table for weights {1, 2, 3, 4}: -// pdf = {0.1, 0.2, 0.3, 0.4} -// prob = {0.4, 0.8, 1.0, 0.8} -// alias = {3, 3, 2, 2} -struct AliasTableTestExecutor +// pdf = {0.1, 0.2, 0.3, 0.4} +// stayProb = {0.4, 0.8, 1.0, 0.8} +// alias = {3, 3, 2, 2} +// +// Log2N = 2 unorm encoding (30 bits for stayProb, 2 bits for alias): +// packedWord = (alias & 0x3) | (round(stayProb * ((1u<<30) - 1)) << 2) +// bin 0: (3) | (429496729 << 2) = 0x66666667 +// bin 1: (3) | (858993458 << 2) = 0xCCCCCCCB +// bin 2: (2) | (1073741823 << 2) = 0xFFFFFFFE +// bin 3: (2) | (858993458 << 2) = 0xCCCCCCCA + +struct PackedAliasATestExecutor +{ + void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) + { + AliasTestPackedWordAccessor wordAcc; + wordAcc.data[0] = 0x66666667u; + wordAcc.data[1] = 0xCCCCCCCBu; + wordAcc.data[2] = 0xFFFFFFFEu; + wordAcc.data[3] = 0xCCCCCCCAu; + + AliasTestPdfAccessor pdfAcc; + pdfAcc.data[0] = 0.1f; + pdfAcc.data[1] = 0.2f; + pdfAcc.data[2] = 0.3f; + pdfAcc.data[3] = 0.4f; + + using Sampler = sampling::PackedAliasTableA; + Sampler sampler = Sampler::create(wordAcc, pdfAcc, AliasTestTableSize); + + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = 
sampler.forwardWeight(input.u, cache); + output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + } +}; + +struct PackedAliasBTestExecutor { void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) { - AliasTestProbAccessor probAcc; - probAcc.data[0] = 0.4f; - probAcc.data[1] = 0.8f; - probAcc.data[2] = 1.0f; - probAcc.data[3] = 0.8f; - - AliasTestAliasAccessor aliasAcc; - aliasAcc.data[0] = 3u; - aliasAcc.data[1] = 3u; - aliasAcc.data[2] = 2u; - aliasAcc.data[3] = 2u; + AliasTestEntryBAccessor entryAcc; + entryAcc.data[0].packedWord = 0x66666667u; entryAcc.data[0].ownPdf = 0.1f; + entryAcc.data[1].packedWord = 0xCCCCCCCBu; entryAcc.data[1].ownPdf = 0.2f; + entryAcc.data[2].packedWord = 0xFFFFFFFEu; entryAcc.data[2].ownPdf = 0.3f; + entryAcc.data[3].packedWord = 0xCCCCCCCAu; entryAcc.data[3].ownPdf = 0.4f; AliasTestPdfAccessor pdfAcc; pdfAcc.data[0] = 0.1f; @@ -56,14 +103,15 @@ struct AliasTableTestExecutor pdfAcc.data[2] = 0.3f; pdfAcc.data[3] = 0.4f; - AliasTestSampler sampler = AliasTestSampler::create(probAcc, aliasAcc, pdfAcc, AliasTestTableSize); + using Sampler = sampling::PackedAliasTableB; + Sampler sampler = Sampler::create(entryAcc, pdfAcc, AliasTestTableSize); - AliasTestSampler::cache_type cache; - output.generatedIndex = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.backwardPdf = sampler.backwardPdf(output.generatedIndex); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + 
output.backwardWeight = sampler.backwardWeight(output.generatedIndex); output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index d5c1d313c..198b72faf 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -7,18 +7,20 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; -struct AliasTablePushConstants +struct CumProbPushConstants { - uint64_t probAddress; // float probability[N] - uint64_t aliasAddress; // uint32_t alias[N] - uint64_t pdfAddress; // float pdf[N] + uint64_t cumProbAddress; // float cumProb[N-1] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; -struct CumProbPushConstants +// Variants A and B both take the entry array plus a separate pdf[] array +// (A: 4 B words, B: 8 B {packedWord, ownPdf}; pdf[] has the same contents in +// both but is tapped independently by the sampler). 
+struct PackedAliasABPushConstants { - uint64_t cumProbAddress; // float cumProb[N-1] + uint64_t entriesAddress; // A: uint32_t words[N] (4 B); B: PackedAliasEntryB[N] (8 B) + uint64_t pdfAddress; // float pdf[N] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 1828139d4..d3cd09326 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -59,7 +59,7 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted);. + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl deleted file mode 100644 index 67047f997..000000000 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#pragma shader_stage(compute) - -#include - -#ifdef BENCH_ITERS -#include "../common/discrete_sampler_bench.hlsl" -#include - -[[vk::push_constant]] AliasTablePushConstants pc; - -struct BdaProbabilityAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaAliasIndexAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaPdfAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = 
vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -using BenchAliasTable = sampling::AliasTable; -#else -#include "../common/alias_table.hlsl" - -[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; -[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; -#endif - -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif -[numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] -void main() -{ - const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; - -#ifdef BENCH_ITERS - BdaProbabilityAccessor probAcc; - probAcc.addr = pc.probAddress; - BdaAliasIndexAccessor aliasAcc; - aliasAcc.addr = pc.aliasAddress; - BdaPdfAccessor pdfAcc; - pdfAcc.addr = pc.pdfAddress; - BenchAliasTable sampler = BenchAliasTable::create(probAcc, aliasAcc, pdfAcc, pc.tableSize); - - float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); - NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; - uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) - { - xi = frac(xi + goldenRatio); - BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(xi, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); - } - - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); -#else - AliasTableTestExecutor executor; - executor(inputTestValues[invID], outputTestValues[invID]); -#endif -} diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 03ac7b36a..438eea31e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = 
nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index 6189d4658..1fb5f6644 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 649c323b2..2a7f1861e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 1091ee447..f06613b49 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -12,13 +12,18 @@ struct BdaCumProbAccessor { using value_type = float32_t; template - void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i))); } - value_type operator[](uint32_t i) NBL_CONST_MEMBER_FUNC 
{ value_type v; get(i, v); return v; } + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i), sizeof(value_type))); } uint64_t addr; }; -using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#if defined(NBL_CUMPROB_EYTZINGER) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#elif defined(NBL_CUMPROB_YOLO_READS) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#else +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#endif #else #include "../common/cumulative_probability.hlsl" @@ -26,11 +31,7 @@ using BenchCumProbSampler = sampling::CumulativeProbabilitySampler outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 17cf83ac5..7b97645b5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl new file mode 100644 index 000000000..b0dbeedac --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl @@ -0,0 +1,114 @@ +#pragma shader_stage(compute) + +#include + +#ifdef BENCH_ITERS +#include "../common/discrete_sampler_bench.hlsl" +#include + +[[vk::push_constant]] 
PackedAliasABPushConstants pc; + +// Log2N bucket. Covers all sweep sizes up to 2^LOG2N buckets without precision +// loss. The same value must be passed to the host-side packA() / +// packB() call so the bit layouts match. +NBL_CONSTEXPR uint32_t LOG2N_BUCKET = 26; + +// Variant A accessor: 4 B packed words. +struct BdaPackedWordAccessor +{ + using value_type = uint32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +// Variant B accessor: 8 B PackedAliasEntryB. Loads a uint2 and decomposes it +// into the POD entry so DXC never sees a bitfield — avoids the Insert/Extract +// round-trip we observed when the sampler read from a bitfield struct. +struct BdaPackedAliasBAccessor +{ + using value_type = nbl::hlsl::sampling::PackedAliasEntryB; + + template) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + const uint64_t loadAddr = addr + uint64_t(8u) * uint64_t(i); + const uint2 raw = vk::RawBufferLoad(loadAddr, 8u); + val.packedWord = raw.x; + val.ownPdf = asfloat(raw.y); + } + + uint64_t addr; +}; + +// Separate 4 B pdf[] accessor. 
+struct BdaPdfAccessor +{ + using value_type = float32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +#ifdef NBL_PACKED_ALIAS_B +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableB; +#else +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableA; +#endif + +#else +#include "../common/alias_table.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; +#endif + +[numthreads(WORKGROUP_SIZE, 1, 1)] +void main() +{ + const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + +#ifdef BENCH_ITERS +#ifdef NBL_PACKED_ALIAS_B + BdaPackedAliasBAccessor entryAcc; +#else + BdaPackedWordAccessor entryAcc; +#endif + entryAcc.addr = pc.entriesAddress; + BdaPdfAccessor pdfAcc; + pdfAcc.addr = pc.pdfAddress; + BenchPackedAlias sampler = BenchPackedAlias::create(entryAcc, pdfAcc, pc.tableSize); + + float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); + NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; + uint32_t acc = 0u; + + [loop] + for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + { + xi = frac(xi + goldenRatio); + BenchPackedAlias::cache_type cache; + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); + } + + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); +#else +#ifdef NBL_PACKED_ALIAS_B + PackedAliasBTestExecutor executor; +#else + PackedAliasATestExecutor executor; +#endif + executor(inputTestValues[invID], outputTestValues[invID]); +#endif +} diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index e0cf7aea0..b5d48cc36 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index d1ef313e5..f543d6dc2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 9b8c234c4..ca4e7eef7 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index ca9b4d43e..fc4ae03b7 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 3d8ec8961..e32251ed8 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index b9766d5ff..542d20587 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl 
index 3595ac86a..bc55facbd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -15,9 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif + [numthreads(WORKGROUP_SIZE, 1, 1)] void main() { diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index cd43c630e..3c832e995 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -190,16 +190,6 @@ void main() aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; ArrayAccessor aliasPdf; aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; - sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; - uint32_t aliasBin0 = aliasTable.generate(0.3); - uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); - acc.x += float32_t(aliasBin0 + aliasBin); - acc.x += aliasTable.forwardPdf(0.3, aliasCache); - acc.x += aliasTable.forwardWeight(0.3, aliasCache); - acc.x += aliasTable.backwardPdf(aliasBin); - acc.x += aliasTable.backwardWeight(aliasBin); // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight ArrayAccessor cumProb; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index 3c43ee119..c0a0e58b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 5879e28bb..1c810afbf 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 02fbf58d2..b2a2fad9a 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -11,97 +11,103 @@ using namespace nbl; -// Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds pipelines once, then sweeps a list of table sizes. For each N it builds -// both tables from the same weight distribution, uploads via BDA buffers, and -// measures GPU throughput using timestamp queries. The cumulative probability -// sampler is run in two variants: the stateful-comparator cache population -// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: struct SetupData { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - std::string aliasShaderKey; - std::string cumProbShaderKey; - std::string cumProbYoloShaderKey; - uint32_t computeFamilyIndex; - uint32_t dispatchGroupCount; + core::smart_refctd_ptr device; + core::smart_refctd_ptr api; + core::smart_refctd_ptr assetMgr; + core::smart_refctd_ptr logger; + IPhysicalDevice* physicalDevice; + std::string packedAliasAShaderKey; + std::string packedAliasBShaderKey; + std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; + std::string cumProbEytzingerShaderKey; + uint32_t computeFamilyIndex; + uint32_t dispatchGroupCount; }; void setup(const SetupData& data) { - m_device = data.device; - m_logger = data.logger; - m_assetMgr = data.assetMgr; + m_device = data.device; + m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_physicalDevice = data.physicalDevice; + m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + // Staging-upload utility. Without this, BDA buffers land in host-visible (system RAM) + // and every sampler load becomes a PCIe round-trip instead of hitting VRAM/L2. 
+ m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + // Command pool + buffers - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); + m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); // Timestamp query pool { - video::IQueryPool::SCreationParams qp = {}; - qp.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qp.queryCount = 2; - qp.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qp); + IQueryPool::SCreationParams qp = {}; + qp.queryType = IQueryPool::TYPE::TIMESTAMP; + qp.queryCount = 2; + qp.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(qp); } const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Shared output buffer (size only depends on thread count) + // Shared output buffer (size only depends on thread count). GPU writes via BDA and + // nothing reads it on the CPU, so pin it to device-local VRAM. 
{ - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = totalThreads * sizeof(uint32_t); - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_outputBuf = m_device->createBuffer(std::move(bp)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + IGPUBuffer::SCreationParams bp = {}; + bp.size = totalThreads * sizeof(uint32_t); + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + m_outputBuf = m_device->createBuffer(std::move(bp)); + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(reqs, m_outputBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } // Pipelines (N-independent; only push constants change per run) - m_aliasPipeline = createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); - m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); - m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_packedAliasAPipeline = createPipeline(data.packedAliasAShaderKey, m_packedAliasAPplnLayout, "alias-packed-A"); + m_packedAliasBPipeline = createPipeline(data.packedAliasBShaderKey, m_packedAliasBPplnLayout, "alias-packed-B"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_cumProbEytzingerPipeline = createPipeline(data.cumProbEytzingerShaderKey, m_cumProbEytzingerPplnLayout, "cumprob-eytzinger"); } // 
DispatchScheduler: uint32_t N -> std::pair. // Lets the caller trade wall-clock for statistical stability per size: // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. - struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + struct DispatchCounts + { + uint32_t warmup; + uint32_t bench; + }; - // Sweep a list of table sizes. For each N: build tables from a fresh weight - // distribution (deterministic seed = 42 + N so different N's get distinct - // distributions but runs are reproducible), upload via BDA, then run all - // three samplers with the dispatch counts chosen by `scheduler`. template void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) { const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", - system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); - m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", - system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-34s | %12s | %12s | %12s | %10s", ILogger::ELL_PERFORMANCE, + "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); for (uint32_t N : tableSizes) { const DispatchCounts dc = scheduler(N); buildAndUpload(N); - runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + // Packed A wins N<=16k; Packed B wins N>=32k. 
SoA and Packed C were dominated + // across every N measured, removed from the sweep. + runSingle(N, "AliasTable (packed A, 4 B)", m_packedAliasAPipeline, m_packedAliasAPplnLayout, SamplerKind::AliasPackedA, dc.warmup, dc.bench); + runSingle(N, "AliasTable (packed B, 8 B)", m_packedAliasBPipeline, m_packedAliasBPplnLayout, SamplerKind::AliasPackedB, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (Eytzinger)", m_cumProbEytzingerPipeline, m_cumProbEytzingerPplnLayout, SamplerKind::CumProbEytzinger, dc.warmup, dc.bench); releaseTables(); } } @@ -109,76 +115,74 @@ class CDiscreteSamplerBenchmark // Convenience: sweep with fixed dispatch counts for every size. void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { - runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { - return {warmupIterations, benchmarkIterations}; - }); + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts + { return {warmupIterations, benchmarkIterations}; }); } private: - enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + enum class SamplerKind + { + AliasPackedA, + AliasPackedB, + CumProbCompare, + CumProbYolo, + CumProbEytzinger + }; template - core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(PushConstantT)}; + const SPushConstantRange 
pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; auto layout = m_device->createPipelineLayout({&pcRange, 1}); if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); - - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = m_assetMgr->getAsset(shaderKey, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - auto shader = m_device->compileShader({.source = source.get()}); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", ILogger::ELL_ERROR, tag); + + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", ILogger::ELL_ERROR, tag); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; + IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + pp.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; } - core::smart_refctd_ptr pipeline; + core::smart_refctd_ptr pipeline; if 
(!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", ILogger::ELL_ERROR, tag); if (m_device->getEnabledFeatures().pipelineExecutableInfo) { auto report = system::to_string(pipeline->getExecutableInfo()); - m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, tag, report.c_str()); } outLayout = std::move(layout); return pipeline; } - core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) + core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } + IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | + IGPUBuffer::EUF_TRANSFER_DST_BIT; + + core::smart_refctd_ptr buf; + 
auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); + future.move_into(buf); return buf; } @@ -186,46 +190,53 @@ class CDiscreteSamplerBenchmark { m_currentN = N; - std::vector weights(N); - std::mt19937 rng(42u + N); + std::vector weights(N); + std::mt19937 rng(42u + N); std::uniform_real_distribution dist(0.001f, 100.0f); for (uint32_t i = 0; i < N; i++) weights[i] = dist(rng); - // Alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + // Build the alias table SoA (intermediate form), then pack it for variants A and B. + // Builder may pad PoT N to N+1 for cache-friendly stride; returned size drives + // every downstream buffer / push-constant value. + std::vector aliasProb; + std::vector aliasIdx; + std::vector aliasPdf; + m_aliasTableN = sampling::AliasTableBuilder::build({weights}, aliasProb, aliasIdx, aliasPdf); + + constexpr uint32_t kPackedLog2N = 26u; + std::vector packedA(m_aliasTableN); + std::vector> packedB(m_aliasTableN); + sampling::AliasTableBuilder::packA({aliasProb}, {aliasIdx}, packedA.data()); + sampling::AliasTableBuilder::packB({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data()); // Cumulative probability (N-1 entries, last bucket implicitly 1.0) - std::vector cumProb(N > 0 ? N - 1 : 0); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); - m_cumProbBuf = cumProbBytes ? 
createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; + std::vector cumProb(N - 1u); + sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); + + // Eytzinger level-order tree: 2*P entries where P = nextPot(N) + const uint32_t eytzingerP = sampling::eytzingerLeafCount(N); + const uint32_t eytzingerTreeSize = 2u * eytzingerP; + std::vector cumProbEytzinger(eytzingerTreeSize); + sampling::buildEytzinger({weights}, cumProbEytzinger.data()); + + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), m_aliasTableN * sizeof(float)); + m_packedAliasABuf = createBdaBuffer(packedA.data(), m_aliasTableN * sizeof(uint32_t)); + m_packedAliasBBuf = createBdaBuffer(packedB.data(), m_aliasTableN * sizeof(sampling::PackedAliasEntryB)); + m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1u) * sizeof(float)); + m_cumProbEytzingerBuf = createBdaBuffer(cumProbEytzinger.data(), eytzingerTreeSize * sizeof(float)); } void releaseTables() { - m_aliasProbBuf = nullptr; - m_aliasIdxBuf = nullptr; - m_aliasPdfBuf = nullptr; - m_cumProbBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_packedAliasABuf = nullptr; + m_packedAliasBBuf = nullptr; + m_cumProbBuf = nullptr; + m_cumProbEytzingerBuf = nullptr; } - void runSingle( - uint32_t N, - const char* name, - const core::smart_refctd_ptr& pipeline, - const core::smart_refctd_ptr& layout, - SamplerKind kind, - uint32_t warmupIterations, - uint32_t benchmarkIterations) + void runSingle(uint32_t N, const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, SamplerKind kind, uint32_t warmupIterations, uint32_t benchmarkIterations) { m_device->waitIdle(); @@ -241,96 +252,103 @@ class CDiscreteSamplerBenchmark // don't measure a tail where the GPU is already winding down. 
const uint32_t cooldownIterations = warmupIterations; - m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_benchCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (kind == SamplerKind::Alias) + if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) { - AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); - pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + PackedAliasABPushConstants pc = {}; + pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = m_aliasTableN; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { - CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf ? m_cumProbBuf->getDeviceAddress() : 0ull; - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + CumProbPushConstants pc = {}; + const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? 
m_cumProbEytzingerBuf : m_cumProbBuf; + pc.cumProbAddress = buf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } for (uint32_t i = 0u; i < warmupIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); for (uint32_t i = 0u; i < cooldownIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; + auto semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; m_queue->submit({&submit, 1u}); m_device->waitIdle(); - uint64_t timestamps[2] = {}; - const auto flags = 
core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + uint64_t timestamps[2] = {}; + const auto flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | + core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - constexpr uint32_t benchIters = BENCH_ITERS; - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); - const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); - const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; - - m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", - system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); + constexpr uint32_t benchIters = BENCH_ITERS; + const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); + const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); + const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); + const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; + const float64_t elapsed_ms = elapsed_ns * 1e-6; + + m_logger->log("%12u | %-34s | %12.3f | %12.3f | %12.3f | %10u", + ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } - 
core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_assetMgr; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_queryPool; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; + core::smart_refctd_ptr m_utils; + core::smart_refctd_ptr m_cmdpool; + core::smart_refctd_ptr m_benchCmdbuf; + core::smart_refctd_ptr m_queryPool; // Pipelines (set up once) - core::smart_refctd_ptr m_aliasPplnLayout; - core::smart_refctd_ptr m_aliasPipeline; - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; - core::smart_refctd_ptr m_cumProbYoloPplnLayout; - core::smart_refctd_ptr m_cumProbYoloPipeline; - - // Per-N data buffers (rebuilt each sweep step) - core::smart_refctd_ptr m_aliasProbBuf; - core::smart_refctd_ptr m_aliasIdxBuf; - core::smart_refctd_ptr m_aliasPdfBuf; - core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_packedAliasAPplnLayout; + core::smart_refctd_ptr m_packedAliasAPipeline; + core::smart_refctd_ptr m_packedAliasBPplnLayout; + core::smart_refctd_ptr m_packedAliasBPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + core::smart_refctd_ptr m_cumProbEytzingerPplnLayout; + core::smart_refctd_ptr m_cumProbEytzingerPipeline; + + // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B. 
+ core::smart_refctd_ptr m_aliasPdfBuf; + core::smart_refctd_ptr m_packedAliasABuf; + core::smart_refctd_ptr m_packedAliasBBuf; + core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_cumProbEytzingerBuf; // Shared - core::smart_refctd_ptr m_outputBuf; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_currentN = 0; + core::smart_refctd_ptr m_outputBuf; + IQueue* m_queue = nullptr; + IPhysicalDevice* m_physicalDevice = nullptr; + uint32_t m_dispatchGroupCount = 0; + uint32_t m_currentN = 0; + uint32_t m_aliasTableN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 9f9854ac5..d95d7f103 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -35,14 +35,12 @@ class CSamplerBenchmark m_logger = data.logger; m_dispatchGroupCount = data.dispatchGroupCount; - // Command pool + 3 command buffers: benchmark (multi-submit), before/after timestamp + // Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] + // so the driver can pipeline adjacent dispatches and the trailing bench dispatches + // aren't measured in a winding-down tail. 
m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf)) m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-before cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-after cmdbuf", system::ILogger::ELL_ERROR); // Timestamp query pool (2 queries: before and after) { @@ -101,26 +99,22 @@ class CSamplerBenchmark m_executableReport = system::to_string(m_pipeline->getExecutableInfo()); } - // Allocate input buffer (host-visible, zero-filled, correctness irrelevant for benchmarking) + // Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness + // irrelevant for benchmarking but we want deterministic input, not garbage) core::smart_refctd_ptr inputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; bparams.size = data.inputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; inputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_inputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to 
allocate input buffer memory", system::ILogger::ELL_ERROR); - if (m_inputAlloc.memory->map({ 0ull, m_inputAlloc.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) - { - std::memset(m_inputAlloc.memory->getMappedPointer(), 0, m_inputAlloc.memory->getAllocationSize()); - m_inputAlloc.memory->unmap(); - } } - // Allocate output buffer (host-visible, GPU writes garbage, never read back) + // Allocate output buffer (device-local VRAM, GPU writes, never read back) core::smart_refctd_ptr outputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; @@ -128,12 +122,29 @@ class CSamplerBenchmark bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; outputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_outputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); } + // Zero-fill the input buffer once on the GPU + { + core::smart_refctd_ptr initCmdbuf; + m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); + initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const asset::SBufferRange range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf }; + initCmdbuf->fillBuffer(range, 0u); + initCmdbuf->end(); + + auto queue = m_device->getQueue(data.computeFamilyIndex, 0); + const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + queue->submit({&submit, 1u}); + m_device->waitIdle(); + } + // Descriptor set: bind both buffers auto pool = 
m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); @@ -161,43 +172,36 @@ class CSamplerBenchmark m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str()); } - // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); - recordBenchmarkCmdBuf(); - recordTimestampCmdBufs(); - - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = { {.cmdbuf = m_timestampBeforeCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = { {.cmdbuf = m_timestampAfterCmdbuf.get()} }; - - // Chains submissions via a timeline semaphore so they execute strictly in order - auto submitSerial = [&](const video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; + const uint32_t cooldownIterations = warmupIterations; + m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); + 
m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); + m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->end(); + + auto semaphore = m_device->createSemaphore(0u); + const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -219,36 +223,10 @@ class CSamplerBenchmark } private: - void recordBenchmarkCmdBuf() - { - m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); - m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - 
m_benchmarkCmdbuf->end(); - } - - void recordTimestampCmdBufs() - { - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - } - core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchmarkCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; core::smart_refctd_ptr m_pplnLayout; core::smart_refctd_ptr m_pipeline; diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 470132aba..e0248d034 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -1,5 +1,7 @@ #include +#include + #include "nbl/examples/examples.hpp" #include "nbl/this_example/builtin/build/spirv/keys.hpp" @@ -109,12 +111,19 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Note: all samplers almost satisfy BasicSampler, but they have cache parameters in generate(). 
static_assert(sampling::concepts::BasicSampler>); static_assert(sampling::concepts::BasicSampler>); - static_assert(sampling::concepts::BasicSampler); - static_assert(sampling::concepts::BasicSampler); + static_assert(sampling::concepts::BasicSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::BasicSampler, sampling::YOLO>>); + static_assert(sampling::concepts::BasicSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::BasicSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::BasicSampler, 4>, ReadOnlyAccessor, 26>>); // --- TractableSampler (level 2) --- generate(domain_type, out cache_type) -> codomain_type, forwardPdf(domain_type, cache_type) -> density_type - static_assert(sampling::concepts::TractableSampler); - static_assert(sampling::concepts::TractableSampler); + ; + static_assert(sampling::concepts::TractableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::TractableSampler, sampling::YOLO>>); + static_assert(sampling::concepts::TractableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::TractableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::TractableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); @@ -130,8 +139,11 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::TractableSampler>); // --- ResamplableSampler (level 3, parallel) --- generate(domain_type, out cache_type) -> codomain_type, forwardWeight(domain_type, cache_type), backwardWeight(codomain_type) - static_assert(sampling::concepts::ResamplableSampler); - static_assert(sampling::concepts::ResamplableSampler); + static_assert(sampling::concepts::ResamplableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::ResamplableSampler, sampling::YOLO>>); + 
static_assert(sampling::concepts::ResamplableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::ResamplableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::ResamplableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); @@ -179,11 +191,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough - // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is - // hidden by hyperthreading rather than by cross-dispatch overlap. constexpr uint32_t testBatchCount = 4096; - constexpr bool DoBenchmark = true; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { @@ -195,8 +204,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat struct BenchEntry { CSamplerBenchmark bench; - std::string sampler; - std::string mode; + std::string sampler; + std::string mode; }; std::vector benchmarks; @@ -222,44 +231,47 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks - constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); - //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:16", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), 
benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "create-only", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + if constexpr (true) + { + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; + addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); + addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (r0,extents)", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + } // Print all pipeline reports first for (auto& entry : benchmarks) @@ -268,16 +280,18 @@ class 
HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.packedAliasAShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get()); + dsData.packedAliasBShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.cumProbEytzingerShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); @@ -295,41 +309,26 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } { - // Sweep covers both the YOLO-vs-Comparator comparison 
(explicit points at - // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from - // N=4 up to 32M in a roughly-power-of-8 progression. + // If you change something here, better change kBenchTable below too const std::vector discreteSizes = { - 4u, - 16u, - 32u, - 100u, - 128u, - 512u, - 8192u, - 10000u, - 131072u, - 1000000u, - 2097152u, - 16777216u, - 33554432u, - }; + 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, + 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; - // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock - // per sampler-per-N stays near 1.5 s. Cost model comes from the prior - // sweep (order-of-magnitude ps/sample vs N). + // Per-N dispatch counts calibrated from a prior measured run auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts { - double ps_per_sample; - if (N < 1000u) ps_per_sample = 15.0; // L1-resident - else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 - else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge - else ps_per_sample = 8000.0; // DRAM-bound - - constexpr double targetNs = 1.5e9; // ~1.5 s per bench - constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); - const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); - const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); - const uint32_t warmup = std::max(20u, bench / 10u); + static constexpr std::pair kBenchTable[] = { + {2u, 7180u}, {4u, 5993u}, {8u, 4490u}, {16u, 4099u}, {32u, 3110u}, {64u, 3026u}, {100u, 2507u}, {128u, 2498u}, {256u, 2477u}, {400u, 2001u}, + {512u, 1827u}, {1024u, 1372u}, {2048u, 1010u}, {2049u, 1010u}, {3000u, 859u}, {4096u, 962u}, {7000u, 742u}, {8192u, 833u}, {10'000u, 590u}, {16'384u, 786u}, {32'768u, 608u}, 
+ {65'536u, 283u}, {131'072u, 174u}, {262'144u, 160u}, {524'288u, 133u}, {1'000'000u, 77u}, {1'048'576u, 128u}, {2'097'152u, 106u}, {16'777'216u, 17u}, {20'971'520u, 17u}, {25'165'824u, 16u}, {33'554'432u, 14u}}; + uint32_t bench = 10u; // fallback for any N not in the table + for (const auto& e : kBenchTable) + if (e.first == N) + { + bench = e.second; + break; + } + const uint32_t warmup = std::max(5u, bench / 10u); return {warmup, bench}; }; @@ -354,8 +353,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat data.logger = m_logger; data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, WORKGROUP_SIZE); + data.shaderKey = std::move(spirvKey); + Tester tester(testBatchCount); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -388,7 +387,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } // --- GPU table sampler tests --- - runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); + runSamplerTest.operator()("PackedAliasA GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_test">(m_device.get()), "PackedAliasATestLog.txt"); + runSamplerTest.operator()("PackedAliasB GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_test">(m_device.get()), "PackedAliasBTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } logJacobianSkipCounts(m_logger.get()); @@ -470,6 +470,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ // Solid angle accuracy and 
small triangle convergence tests (CPU-only) // ================================================================ + if constexpr (true) { m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 32f0e3b28..7665ebbb7 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -6,13 +6,31 @@ #include "nbl/examples/Tester/ITester.h" #include "SamplerTestHelpers.h" -class CAliasTableGPUTester final : public ITester +// Shared GPU correctness harness for the packed alias variants. Labels for +// failed-field messages are selected from the Executor type at compile time. +template +class CPackedAliasTableGPUTester final : public ITester { - using base_t = ITester; - using R = AliasTableTestResults; + using base_t = ITester; + using R = AliasTableTestResults; + + using typename base_t::TestType; + using base_t::getRandomEngine; + using base_t::verifyTestValue; + using base_t::printTestFail; + + static constexpr bool kIsA = std::is_same_v; + static constexpr const char* kGeneratedIdxName = kIsA ? "PackedAliasA::generatedIndex" : "PackedAliasB::generatedIndex"; + static constexpr const char* kForwardPdfName = kIsA ? "PackedAliasA::forwardPdf" : "PackedAliasB::forwardPdf"; + static constexpr const char* kBackwardPdfName = kIsA ? "PackedAliasA::backwardPdf" : "PackedAliasB::backwardPdf"; + static constexpr const char* kForwardWeightName = kIsA ? "PackedAliasA::forwardWeight" : "PackedAliasB::forwardWeight"; + static constexpr const char* kBackwardWeightName = kIsA ? "PackedAliasA::backwardWeight" : "PackedAliasB::backwardWeight"; + static constexpr const char* kJacobianName = kIsA ? 
"PackedAliasA::jacobianProduct" : "PackedAliasB::jacobianProduct"; + static constexpr const char* kPdfConsistencyName = kIsA ? "PackedAliasA::pdf consistency" : "PackedAliasB::pdf consistency"; + static constexpr const char* kWeightConsistencyName = kIsA ? "PackedAliasA::weight consistency" : "PackedAliasB::weight consistency"; public: - CAliasTableGPUTester(const uint32_t testBatchCount, const uint32_t workgroupSize) : base_t(testBatchCount, workgroupSize) {} + CPackedAliasTableGPUTester(const uint32_t testBatchCount) : base_t(testBatchCount, WORKGROUP_SIZE) {} private: AliasTableInputValues generateInputTestValues() override @@ -27,7 +45,7 @@ class CAliasTableGPUTester final : public ITester; +using CPackedAliasBGPUTester = CPackedAliasTableGPUTester; + #endif diff --git a/37_HLSLSamplingTests/tests/CBilinearTester.h b/37_HLSLSamplingTests/tests/CBilinearTester.h index 739af4584..f5bea6896 100644 --- a/37_HLSLSamplingTests/tests/CBilinearTester.h +++ b/37_HLSLSamplingTests/tests/CBilinearTester.h @@ -14,7 +14,7 @@ class CBilinearTester final : public ITester #include #include +#include // Generic ReadOnly accessor wrapping a raw pointer template + requires std::is_arithmetic_v struct ReadOnlyAccessor { - using value_type = T; - template requires std::is_arithmetic_v - void get(I i, V& val) const { val = V(data[i]); } - T operator[](uint32_t i) const { return data[i]; } + using value_type = T; + template + requires std::is_arithmetic_v + void get(I i, V& val) const { val = V(data[i]); } - const T* data; + const T* data; }; -using ProbabilityAccessor = ReadOnlyAccessor; -using AliasIndexAccessor = ReadOnlyAccessor; -using PdfAccessor = ReadOnlyAccessor; - -using TestAliasTable = nbl::hlsl::sampling::AliasTable; -using TestCumulativeProbabilitySampler = nbl::hlsl::sampling::CumulativeProbabilitySampler>; - // Tests table construction for both alias method and cumulative probability. 
// Sampler generate/pdf correctness is verified by GPU testers (CAliasTableGPUTester, CCumulativeProbabilityGPUTester). class CDiscreteTableTester { -public: - CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} - - bool run() - { - bool pass = true; - auto cases = createTestCases(); - - m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testAliasTable(tc.name, tc.weights); - - m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testCumulativeProbability(tc.name, tc.weights); - - return pass; - } - -private: - struct TestCase - { - const char* name; - std::vector weights; - }; - - static std::vector createTestCases() - { - std::vector cases; - cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); - cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); - - { - std::vector w(32, 1.0f); - w[31] = 97.0f; - cases.push_back({"SingleDominant(32)", std::move(w)}); - } - { - std::vector w(64); - for (uint32_t i = 0; i < 64; i++) - w[i] = 1.0f / float(i + 1); - cases.push_back({"PowerLaw(64)", std::move(w)}); - } - - cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); - - { - std::vector w(1024); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < 1024; i++) - w[i] = dist(rng); - cases.push_back({"Random(1024)", std::move(w)}); - } - - return cases; - } - - // Verify all values in array are in [0, 1] - bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const - { - bool pass = true; - for (uint32_t i = 0; i < count; i++) - { - if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) - { - m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", - system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); - pass = false; - } - } - return pass; - } - - // Shared: verify PDFs sum to 1 and each 
matches weight/totalWeight - bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - bool pass = true; - - float pdfSum = 0.0f; - for (uint32_t i = 0; i < N; i++) - pdfSum += pdf[i]; - - if (std::abs(pdfSum - 1.0f) > 1e-5f) - { - m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); - pass = false; - } - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight; - const float err = std::abs(expected - pdf[i]); - if (err > 1e-6f) - { - m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); - pass = false; - } - } - - return pass; - } - - // Verify alias table builder output: - // - bucket contributions reconstruct correct probabilities - // - PDFs sum to 1 and match weight/totalWeight - // - alias indices in range, probabilities in [0, 1] - bool testAliasTable(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector outProbability(N); - std::vector outAlias(N); - std::vector outPdf(N); - std::vector workspace(N); - - nbl::hlsl::sampling::AliasTableBuilder::build({ weights },outProbability.data(), outAlias.data(), outPdf.data(), workspace.data()); - - // Accumulate bucket contributions - std::vector dest(N, 0.0f); - for (uint32_t i = 0; i < N; i++) - { - dest[i] += outProbability[i]; - dest[outAlias[i]] += (1.0f - outProbability[i]); - } - - bool pass = true; - - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight * float(N); - const float err = std::abs(expected - dest[i]); - const float tolerance = std::max(1e-5f * float(N), 
1e-4f); - - if (err > tolerance) - { - m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", - system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); - pass = false; - } - } - - // Alias indices in range - for (uint32_t i = 0; i < N; i++) - { - if (outAlias[i] >= N) - { - m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", - system::ILogger::ELL_ERROR, name, i, outAlias[i], N); - pass = false; - } - } - - pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); - pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), N); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - // Verify CDF table construction: - // - cumulative probabilities are monotonically non-decreasing - // - PDFs match weight/totalWeight - // - PDFs sum to 1 - bool testCumulativeProbability(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector cumProb(N - 1); - - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram( - std::span(weights), - cumProb.data()); - - bool pass = true; - - // Monotonically non-decreasing - for (uint32_t i = 1; i < N - 1; i++) - { - if (cumProb[i] < cumProb[i - 1] - 1e-7f) - { - m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", - system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); - pass = false; - } - } - - // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) - if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) - { - m_logger->log("CumProb[%s] last stored entry %f >= 1.0", - system::ILogger::ELL_ERROR, name, cumProb[N - 2]); - pass = false; - } - - // Derive PDF from CDF for verification - std::vector pdf(N); - for (uint32_t i = 0; i < N; i++) - { - const float cur = (i < N - 1) ? cumProb[i] : 1.0f; - const float prev = (i > 0) ? 
cumProb[i - 1] : 0.0f; - pdf[i] = cur - prev; - } - - pass &= verifyPdf("CumProb", name, pdf.data(), weights); - pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - system::ILogger* m_logger; + public: + CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} + + bool run() + { + bool pass = true; + auto cases = createTestCases(); + + m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testAliasTable(tc.name, tc.weights); + + m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testCumulativeProbability(tc.name, tc.weights); + + m_logger->log("CumulativeProbabilitySampler tests (TRACKING / YOLO / EYTZINGER):", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testSamplers(tc.name, tc.weights); + + return pass; + } + + private: + struct TestCase + { + const char* name; + std::vector weights; + }; + + static std::vector createTestCases() + { + std::vector cases; + cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); + cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); + + { + std::vector w(32, 1.0f); + w[31] = 97.0f; + cases.push_back({"SingleDominant(32)", std::move(w)}); + } + { + std::vector w(64); + for (uint32_t i = 0; i < 64; i++) + w[i] = 1.0f / float(i + 1); + cases.push_back({"PowerLaw(64)", std::move(w)}); + } + + cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); + + { + std::vector w(1024); + std::mt19937 rng(42); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1024; i++) + w[i] = dist(rng); + cases.push_back({"Random(1024)", std::move(w)}); + } + + // NPoT cases exercise EYTZINGER padded-leaf territory (P > N). 
+ cases.push_back({"NonPot(7)", {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}}); + { + std::vector w(1000); + std::mt19937 rng(4242); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1000; i++) + w[i] = dist(rng); + cases.push_back({"Random(1000)", std::move(w)}); + } + + return cases; + } + + // Verify all values in array are in [0, 1] + bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const + { + bool pass = true; + for (uint32_t i = 0; i < count; i++) + { + if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) + { + m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", + system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); + pass = false; + } + } + return pass; + } + + // Shared: verify PDFs sum to 1 and each matches weight/totalWeight + bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + + bool pass = true; + + float pdfSum = 0.0f; + for (uint32_t i = 0; i < N; i++) + pdfSum += pdf[i]; + + if (std::abs(pdfSum - 1.0f) > 1e-5f) + { + m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); + pass = false; + } + + for (uint32_t i = 0; i < N; i++) + { + const float expected = weights[i] / totalWeight; + const float err = std::abs(expected - pdf[i]); + if (err > 1e-6f) + { + m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); + pass = false; + } + } + + return pass; + } + + // Verify alias table builder output: + // - bucket contributions reconstruct correct scaled probabilities + // - PDFs sum to 1 and match weight/totalWeight + // - alias indices in range, probabilities in [0, 1] + // Builder transparently pads PoT N to N+1; actual table 
size comes back + // as `tableN` and is what gets compared against. + bool testAliasTable(const char* name, const std::vector& weights) const + { + const uint32_t userN = static_cast(weights.size()); + + std::vector outProbability; + std::vector outAlias; + std::vector outPdf; + const uint32_t tableN = nbl::hlsl::sampling::AliasTableBuilder::build({weights}, outProbability, outAlias, outPdf); + + // Accumulate bucket contributions over the full (possibly padded) table + std::vector dest(tableN, 0.0f); + for (uint32_t i = 0; i < tableN; i++) + { + dest[i] += outProbability[i]; + dest[outAlias[i]] += (1.0f - outProbability[i]); + } + + bool pass = true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + // Real buckets: expected scaled prob = weight/total * tableN + for (uint32_t i = 0; i < userN; i++) + { + const float expected = weights[i] / totalWeight * float(tableN); + const float err = std::abs(expected - dest[i]); + const float tolerance = std::max(1e-5f * float(tableN), 1e-4f); + + if (err > tolerance) + { + m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", + system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); + pass = false; + } + } + + // Dummy bucket (only when padded): no real bucket aliases to it -> dest[userN] should be 0. 
+ if (tableN != userN && std::abs(dest[userN]) > 1e-4f) + { + m_logger->log("AliasTable[%s] dummy bucket %u has non-zero reconstructed probability %f", + system::ILogger::ELL_ERROR, name, userN, dest[userN]); + pass = false; + } + + // Alias indices in range [0, tableN) + for (uint32_t i = 0; i < tableN; i++) + { + if (outAlias[i] >= tableN) + { + m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", + system::ILogger::ELL_ERROR, name, i, outAlias[i], tableN); + pass = false; + } + } + + pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); + pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), tableN); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Verify CDF table construction: monotonicity, implicit-1.0 invariant, and + // stored entries in [0, 1]. PDF-from-CDF correctness is covered by the + // TRACKING sampler test below (same cdf[i] - cdf[i-1] derivation via + // sampler.backwardPdf), so it's not repeated here. 
+ bool testCumulativeProbability(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + + std::vector cumProb(N - 1); + + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram(std::span(weights), cumProb.data()); + + bool pass = true; + + // Monotonically non-decreasing + for (uint32_t i = 1; i < N - 1; i++) + { + if (cumProb[i] < cumProb[i - 1] - 1e-7f) + { + m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", + system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); + pass = false; + } + } + + // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) + if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) + { + m_logger->log("CumProb[%s] last stored entry %f >= 1.0", system::ILogger::ELL_ERROR, name, cumProb[N - 2]); + pass = false; + } + + pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Reference binary search over the full N-entry CDF (last entry == 1.0). + static uint32_t referenceUpperBound(const std::vector& fullCdf, float u) + { + auto it = std::upper_bound(fullCdf.begin(), fullCdf.end(), u); + return static_cast(std::distance(fullCdf.begin(), it)); + } + + // Run TRACKING, YOLO, and EYTZINGER samplers against the same reference + // distribution. Each mode is instantiated via the dual-compile sampler and + // exercised entirely on the CPU. 
+ bool testSamplers(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + if (N < 2) + return true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + const float rcpTotal = 1.0f / totalWeight; + + std::vector pdfRef(N); + std::vector fullCdf(N); + float acc = 0.0f; + for (uint32_t i = 0; i < N; i++) + { + pdfRef[i] = weights[i] * rcpTotal; + acc += pdfRef[i]; + fullCdf[i] = acc; + } + fullCdf[N - 1] = 1.0f; // pin the last entry; reference must treat it as exact + + // Storage for TRACKING / YOLO (N-1 entries, last bucket implicit at 1.0). + std::vector cdfStorage(N - 1); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cdfStorage.data()); + + // Storage for EYTZINGER (2*P entries, level-order implicit binary tree). + const uint32_t P = nbl::hlsl::sampling::eytzingerLeafCount(N); + std::vector treeStorage(2u * P, 0.0f); + nbl::hlsl::sampling::buildEytzinger({weights}, treeStorage.data()); + + bool pass = true; + pass &= testSamplerMode("TRACKING", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("YOLO", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("EYTZINGER", name, N, pdfRef, fullCdf, treeStorage.data()); + return pass; + } + + template + bool testSamplerMode(const char* modeName, const char* caseName, uint32_t N, + const std::vector& pdfRef, const std::vector& fullCdf, const float* accessorData) const + { + using Sampler = nbl::hlsl::sampling::CumulativeProbabilitySampler< + float, float, uint32_t, ReadOnlyAccessor, Mode>; + + ReadOnlyAccessor accessor {accessorData}; + Sampler sampler = Sampler::create(accessor, N); + + bool pass = true; + + // backwardPdf(v) == pdfRef[v], and the implied PDF sums to 1. 
+ float backwardSum = 0.0f; + for (uint32_t v = 0; v < N; v++) + { + const float got = sampler.backwardPdf(v); + const float expected = pdfRef[v]; + const float err = std::abs(got - expected); + const float tol = 1e-5f; + if (err > tol) + { + m_logger->log("Sampler[%s][%s] backwardPdf[%u]: expected %e, got %e (err=%e)", + system::ILogger::ELL_ERROR, modeName, caseName, v, expected, got, err); + pass = false; + } + backwardSum += got; + } + if (std::abs(backwardSum - 1.0f) > 1e-5f) + { + m_logger->log("Sampler[%s][%s] backwardPdf sum: expected 1.0, got %f", + system::ILogger::ELL_ERROR, modeName, caseName, backwardSum); + pass = false; + } + + // generate(u) lands in the correct bucket for a grid of u values, and + // generate(u, cache) produces forwardPdf matching backwardPdf(result). + std::mt19937 rng(1234u + N); + std::uniform_real_distribution udist(0.0f, std::nextafter(1.0f, 0.0f)); + constexpr uint32_t kTrials = 2048; + + for (uint32_t k = 0; k < kTrials; k++) + { + const float u = udist(rng); + const uint32_t ref = referenceUpperBound(fullCdf, u); + + const uint32_t idx = sampler.generate(u); + if (idx != ref) + { + m_logger->log("Sampler[%s][%s] generate(%.7f): expected bucket %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idx); + pass = false; + continue; + } + + typename Sampler::cache_type cache; + const uint32_t idxCache = sampler.generate(u, cache); + if (idxCache != ref) + { + m_logger->log("Sampler[%s][%s] generate(u,cache)(%.7f): expected %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idxCache); + pass = false; + continue; + } + + const float forwardP = sampler.forwardPdf(u, cache); + const float backwardP = sampler.backwardPdf(idxCache); + if (std::abs(forwardP - backwardP) > 1e-6f) + { + m_logger->log("Sampler[%s][%s] fwd/bwd pdf mismatch at u=%.7f bucket=%u: fwd=%e bwd=%e", + system::ILogger::ELL_ERROR, modeName, caseName, u, idxCache, forwardP, backwardP); + pass = false; + } + } + + if (pass) + 
m_logger->log(" [%-9s %s] PASSED", system::ILogger::ELL_PERFORMANCE, modeName, caseName); + return pass; + } + + system::ILogger* m_logger; }; #endif diff --git a/37_HLSLSamplingTests/tests/CLinearTester.h b/37_HLSLSamplingTests/tests/CLinearTester.h index 814fbb1d7..394b68721 100644 --- a/37_HLSLSamplingTests/tests/CLinearTester.h +++ b/37_HLSLSamplingTests/tests/CLinearTester.h @@ -14,7 +14,7 @@ class CLinearTester final : public ITestergetFamilyIndex(); m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); // Load shaders, set up pipeline { @@ -1024,6 +1019,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso dummyBuff->setObjectDebugName("benchmark buffer"); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + reqs.memoryTypeBits &= base.m_physicalDevice->getDeviceLocalMemoryTypeBits(); m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_allocation.isValid()) @@ -1075,104 +1071,51 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso { m_device->waitIdle(); - recordTimestampQueryCmdBuffers(); - - uint64_t semaphoreCounter = 0; - smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { 
{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; - - IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; - beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; - beforeTimestapSubmitInfo[0].signalSemaphores = signals; - beforeTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; - afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; - afterTimestapSubmitInfo[0].signalSemaphores = signals; - afterTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; - benchmarkSubmitInfos[0].commandBuffers = cmdbufs; - benchmarkSubmitInfos[0].signalSemaphores = signals; - benchmarkSubmitInfos[0].waitSemaphores = waits; - - m_pushConstants.benchmarkMode = mode; - recordCmdBuff(); - // warmup runs - for (int i = 0; i < WarmupIterations; ++i) - { - if(i == 0) - m_api->startCapture(); - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - if (i == 0) - m_api->endCapture(); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(beforeTimestapSubmitInfo); - - // actual benchmark runs - for (int i = 0; i < Iterations; ++i) - { - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - 
m_computeQueue->submit(afterTimestapSubmitInfo); - - m_device->waitIdle(); + // [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] in one cmdbuf, + // one submit. Per-submit semaphore chaining adds sync cost and blocks driver pipelining; + // the cooldown keeps the GPU in steady state across ts 1 so the trailing bench + // dispatches don't land in a winding-down tail. + constexpr int CooldownIterations = WarmupIterations; - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - void recordCmdBuff() - { - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_cmdbuf->bindComputePipeline(m_pipeline.get()); m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + for (int i = 0; i < WarmupIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (int i = 0; i < Iterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (int i = 0; i < CooldownIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); m_cmdbuf->endDebugMarker(); m_cmdbuf->end(); - 
} - void recordTimestampQueryCmdBuffers() - { - static bool firstInvocation = true; + smart_refctd_ptr semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmdbufs; + submit.signalSemaphores = signalSem; - if (!firstInvocation) - { - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - } + m_api->startCapture(); + m_computeQueue->submit({&submit, 1u}); + m_api->endCapture(); - m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdBuff->end(); + m_device->waitIdle(); - m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdBuff->end(); + const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); + const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - firstInvocation = false; + m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); } uint64_t calcTimeElapsed() @@ -1196,8 +1139,6 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso BenchmarkPushConstants m_pushConstants; smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; - smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; smart_refctd_ptr m_queryPool = nullptr; 
uint32_t m_queueFamily; From 23d6c4c83c2898e8d6fb8329fbd266cbb07ce144 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 13 May 2026 09:43:35 +0300 Subject: [PATCH 5/5] sampler bench BDA push constants + spherical rect tests - pipeline layout declares the PC range, output buf is BDA-allocated, PC pushed before the dispatch loop - spherical rectangle: new tests for generateNormalizedLocal / generateUnnormalized / computeHitT, bug in computeHitT - drop redundant pdf field in tests --- 37_HLSLSamplingTests/CMakeLists.txt | 1 + .../common/sampler_bench_pc.hlsl | 15 ++++++++++++ .../common/spherical_rectangle.hlsl | 18 ++++++++++++++ .../common/uniform_hemisphere.hlsl | 1 - .../app_resources/common/uniform_sphere.hlsl | 1 - .../shaders/bilinear_test.comp.hlsl | 5 ++-- .../box_muller_transform_test.comp.hlsl | 5 ++-- .../shaders/concentric_mapping_test.comp.hlsl | 5 ++-- .../shaders/linear_test.comp.hlsl | 5 ++-- .../shaders/polar_mapping_test.comp.hlsl | 5 ++-- .../projected_hemisphere_test.comp.hlsl | 5 ++-- .../shaders/projected_sphere_test.comp.hlsl | 5 ++-- ...ojected_spherical_rectangle_test.comp.hlsl | 5 ++-- ...rojected_spherical_triangle_test.comp.hlsl | 5 ++-- .../spherical_rectangle_test.comp.hlsl | 5 ++-- .../shaders/spherical_triangle.comp.hlsl | 5 ++-- .../shaders/uniform_hemisphere_test.comp.hlsl | 5 ++-- .../shaders/uniform_sphere_test.comp.hlsl | 5 ++-- .../benchmarks/CSamplerBenchmark.h | 20 ++++++++++++---- .../tests/CSphericalRectangleTester.h | 24 +++++++++++++++++++ .../tests/CUniformHemisphereTester.h | 1 - .../tests/CUniformSphereTester.h | 1 - 22 files changed, 113 insertions(+), 34 deletions(-) create mode 100644 37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index e50fe4663..78e3ab319 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -42,6 +42,7 @@ set(DEPENDS 
app_resources/common/concentric_mapping.hlsl app_resources/common/polar_mapping.hlsl app_resources/common/discrete_sampler_bench.hlsl + app_resources/common/sampler_bench_pc.hlsl app_resources/common/alias_table.hlsl app_resources/common/cumulative_probability.hlsl ) diff --git a/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl new file mode 100644 index 000000000..ab357e504 --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl @@ -0,0 +1,15 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ + +#include + +// Implicit-output benchmark push constants. Every sampler bench shader writes +// one uint32_t accumulator per thread to outputAddress[invID]; nothing reads it +// back -- the goal is to keep the optimiser from eliding the sampling work. +// Mirrors the BDA convention from discrete_sampler_bench.hlsl. 
+struct SamplerBenchPushConstants +{ + uint64_t outputAddress; +}; + +#endif diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 4f8d20964..68159405a 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -22,6 +22,11 @@ struct SphericalRectangleTestResults float32_t3 generated; float32_t2 surfaceOffset; float32_t3 referenceDirection; + float32_t3 normalizedLocal; + float32_t hitDist; + float32_t3 unnormalized; + float32_t computedHitT; + float32_t3 normalizedLocalToWorld; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; @@ -61,6 +66,19 @@ struct SphericalRectangleTestExecutor + sampler.basis[1] * localDir[1] + sampler.basis[2] * localDir[2]; } + { + sampling::SphericalRectangle::cache_type cache; + output.normalizedLocal = sampler.generateNormalizedLocal(input.u, cache, output.hitDist); + output.normalizedLocalToWorld = sampler.basis[0] * output.normalizedLocal[0] + + sampler.basis[1] * output.normalizedLocal[1] + + sampler.basis[2] * output.normalizedLocal[2]; + } + { + sampling::SphericalRectangle::cache_type cache; + output.unnormalized = sampler.generateUnnormalized(input.u, cache); + } + output.computedHitT = sampler.computeHitT(output.generated); + output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index fb51838c7..8541bef19 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -15,7 +15,6 @@ struct UniformHemisphereInputValues 
struct UniformHemisphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3737f4575..fb4086e44 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -15,7 +15,6 @@ struct UniformSphereInputValues struct UniformSphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 438eea31e..420cbcd0b 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BilinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index 1fb5f6644..3302db2e9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS 
-[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BoxMullerTransformTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 2a7f1861e..058c3ef11 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -35,7 +36,7 @@ void main() acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ConcentricMappingTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 7b97645b5..acf0887e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; 
+#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else LinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index b5d48cc36..b12b276e3 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -35,7 +36,7 @@ void main() acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else PolarMappingTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index f543d6dc2..9be02b9fd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" 
+[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index ca4e7eef7..7488dc2d5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index fc4ae03b7..dd7f62db4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" 
+[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -66,7 +67,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index e32251ed8..9ed69291a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -54,7 +55,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 542d20587..8cba7fbcb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants 
benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -112,7 +113,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index bc55facbd..14b4843b9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -53,7 +54,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index c0a0e58b2..50901e481 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ 
-36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 1c810afbf..0351e358f 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index d95d7f103..4f63c6fde 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h @@ -7,6 +7,7 @@ #include #include "nbl/examples/examples.hpp" +#include "../app_resources/common/sampler_bench_pc.hlsl" using namespace nbl; @@ -81,7 +82,12 @@ class CSamplerBenchmark }; auto dsLayout = m_device->createDescriptorSetLayout(bindings); - m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout)); + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(SamplerBenchPushConstants), + }; + m_pplnLayout = m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout)); { video::IGPUComputePipeline::SCreationParams pparams = {}; @@ -119,13 +125,14 @@ class CSamplerBenchmark { video::IGPUBuffer::SCreationParams bparams = {}; bparams.size = data.outputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); - m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); if (!m_outputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); + m_outputAddress = outputBuf->getDeviceAddress(); } // Zero-fill the input buffer once on the GPU @@ -183,6 +190,10 @@ class CSamplerBenchmark m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, 
&m_ds.get()); + { + SamplerBenchPushConstants pc = { .outputAddress = m_outputAddress }; + m_benchmarkCmdbuf->pushConstants(m_pplnLayout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc); + } for (uint32_t i = 0u; i < warmupIterations; ++i) m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); @@ -233,6 +244,7 @@ class CSamplerBenchmark core::smart_refctd_ptr m_ds; video::IDeviceMemoryAllocator::SAllocation m_inputAlloc = {}; video::IDeviceMemoryAllocator::SAllocation m_outputAlloc = {}; + uint64_t m_outputAddress = 0; video::IQueue* m_queue = nullptr; video::IPhysicalDevice* m_physicalDevice = nullptr; uint32_t m_dispatchGroupCount = 0; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index bc74f6415..7aabc48ec 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -55,6 +55,10 @@ class CSphericalRectangleTester final : public ITester world == generate", actual.generated, actual.normalizedLocalToWorld, iteration, seed, testType, 5e-5, 5e-3); + // computeHitT(generated) must equal hitDist returned by generateNormalizedLocal + pass &= verifyTestValue("SphericalRectangle::computeHitT == hitDist", actual.computedHitT, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + // generateUnnormalized direction must be parallel to generate() (cross product near zero) + { + const nbl::hlsl::float32_t3 c = nbl::hlsl::cross(actual.unnormalized, actual.generated); + pass &= verifyTestValue("SphericalRectangle::generateUnnormalized parallel to generate", c, nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f), iteration, seed, testType, 1e-3, 5e-2); + } + // |generateUnnormalized| must equal hitDist (distance to hitpoint along the unit ray) + { + const float ulen = nbl::hlsl::length(actual.unnormalized); + 
pass &= verifyTestValue("SphericalRectangle::|generateUnnormalized| == hitDist", ulen, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + } + if (!pass && iteration < m_inputs.size()) logFailedInput(m_logger.get(), m_inputs[iteration]); diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index b07cee739..4f80ecbaf 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -38,7 +38,6 @@ class CUniformHemisphereTester final : public ITester