From 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2 Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Thu, 16 Apr 2026 14:58:47 +0200 Subject: [PATCH 1/5] make NEE work in ex 31 with Global L solid angle sampling of spherical rect --- .../hlsl/next_event_estimator.hlsl | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index c8bee786c..29aca1824 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -246,6 +246,7 @@ template struct ShapeSampling { using scalar_type = T; + using vector2_type = vector; using vector3_type = vector; static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) @@ -262,48 +263,56 @@ struct ShapeSampling matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - scalar_type solidAngle = sphR0.solidAngle(ray.origin).value; - if (solidAngle > numeric_limits::min) - pdf = 1.f / solidAngle; - else - pdf = bit_cast(numeric_limits::infinity); - return pdf; + + // 1.f/0.f gives infinity no special checks needed + return 1.f / sphR0.solidAngle(ray.origin).value; } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; - matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = 
hlsl::promote(0.0); + // sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0, origin); - if ( ssph.solidAngle > numeric_limits::min) + typename sampling::SphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - typename sampling::SphericalRectangle::cache_type cache; - const vector3_type localDir = ssph.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = ssph.forwardPdf(xi.xy, cache); + // actually the slowest + //L = ssph.generate(xi.xy, cache); + //newRayMaxT = ssph.computeHitT(L); + + // fastest + const vector3_type localL = ssph.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + L = hlsl::mul(hlsl::transpose(ssph.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); + { + L = ssph.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + pdf = ssph.forwardPdf(xi.xy,cache); return L; } @@ -322,7 +331,7 @@ struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; - +#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. 
// Same grazing-angle limitations as the triangle variant -- see comments @@ -398,7 +407,7 @@ struct ShapeSampling Shape rect; }; - +#endif template struct NextEventEstimator From 89ecce14443c216b30ff84b837b899045bb5513f Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Fri, 17 Apr 2026 03:26:32 +0200 Subject: [PATCH 2/5] prep for rendering with PSA rectangle --- .../hlsl/next_event_estimator.hlsl | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 29aca1824..91d2a2d5e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -177,9 +177,7 @@ struct ShapeSampling const vector3_type tri_vertices[3] = {tri.vertex0, tri.vertex1, tri.vertex2}; shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri_vertices, ray.origin); sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - const scalar_type pdf = pst.backwardPdf(L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return pst.backwardWeight(L); } template @@ -331,7 +329,6 @@ struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; -#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. 
// Same grazing-angle limitations as the triangle variant -- see comments @@ -361,21 +358,12 @@ struct ShapeSampling sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, ray.origin, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - // Reconstruct normalized [0,1]^2 position on the rectangle from the ray direction - const vector3_type N = rect.getNormalTimesArea(); - const scalar_type t = hlsl::dot(N, rect.offset - ray.origin) / hlsl::dot(N, ray.direction); - const vector3_type hitPoint = ray.origin + ray.direction * t; - const vector3_type localHit = hitPoint - rect.offset; - const vector p = vector(hlsl::dot(localHit, rectNormalBasis[0]) / rectExtents.x, hlsl::dot(localHit, rectNormalBasis[1]) / rectExtents.y); - const scalar_type pdf = psr.backwardPdf(p); - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return psr.backwardWeight(ray.direction); } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; matrix rectNormalBasis; vector rectExtents; @@ -384,30 +372,40 @@ struct ShapeSampling sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, origin, interaction.getN(), interaction.isMaterialBSDF()); - const scalar_type solidAngle = psr.sphrect.solidAngle; - if (solidAngle > numeric_limits::min) + typename sampling::ProjectedSphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - 
typename sampling::ProjectedSphericalRectangle::cache_type cache; - const vector3_type localDir = psr.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = psr.forwardPdf(xi.xy, cache); + // actually the slowest + //L = psr.generate(xi.xy, cache); + //newRayMaxT = psr.sphrect.computeHitT(L); + + // fastest + const vector3_type localL = psr.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + // hopefully CSE kicks in for the `UsePdfAsWeight==true` + L = hlsl::mul(hlsl::transpose(psr.sphrect.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); - - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + { + L = psr.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; + + pdf = psr.forwardPdf(xi.xy,cache); return L; } Shape rect; }; -#endif template struct NextEventEstimator From fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 22 Apr 2026 01:16:12 +0300 Subject: [PATCH 3/5] jacobian tests, better benchmarks, addressed comments --- 37_HLSLSamplingTests/CMakeLists.txt | 172 ++++++- .../app_resources/common/alias_table.hlsl | 2 + .../app_resources/common/array_accessor.hlsl | 1 - .../app_resources/common/bilinear.hlsl | 6 + .../common/box_muller_transform.hlsl | 3 + .../common/concentric_mapping.hlsl | 12 +- .../common/cumulative_probability.hlsl | 2 + .../common/discrete_sampler_bench.hlsl | 3 - .../app_resources/common/jacobian_test.hlsl | 264 ++++++++++ .../app_resources/common/linear.hlsl | 3 + .../app_resources/common/polar_mapping.hlsl | 20 +- 
.../common/projected_hemisphere.hlsl | 8 +- .../common/projected_sphere.hlsl | 3 + .../common/projected_spherical_rectangle.hlsl | 44 +- .../common/projected_spherical_triangle.hlsl | 21 +- .../common/spherical_rectangle.hlsl | 16 +- .../common/spherical_triangle.hlsl | 17 +- .../common/uniform_hemisphere.hlsl | 8 +- .../app_resources/common/uniform_sphere.hlsl | 9 +- .../shaders/alias_table_test.comp.hlsl | 11 +- .../shaders/bilinear_test.comp.hlsl | 28 +- .../box_muller_transform_test.comp.hlsl | 28 +- .../shaders/concentric_mapping_test.comp.hlsl | 20 +- .../cumulative_probability_test.comp.hlsl | 6 +- .../shaders/linear_test.comp.hlsl | 28 +- .../shaders/polar_mapping_test.comp.hlsl | 20 +- .../projected_hemisphere_test.comp.hlsl | 20 +- .../shaders/projected_sphere_test.comp.hlsl | 20 +- ...ojected_spherical_rectangle_test.comp.hlsl | 54 +- ...rojected_spherical_triangle_test.comp.hlsl | 41 +- .../spherical_rectangle_test.comp.hlsl | 102 +++- .../shaders/spherical_triangle.comp.hlsl | 40 +- .../shaders/test_compile.comp.hlsl | 78 ++- .../shaders/uniform_hemisphere_test.comp.hlsl | 20 +- .../shaders/uniform_sphere_test.comp.hlsl | 20 +- .../benchmarks/CDiscreteSamplerBenchmark.h | 391 +++++++------- .../benchmarks/CSamplerBenchmark.h | 6 +- 37_HLSLSamplingTests/main.cpp | 206 +++++--- .../tests/CAliasTableGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CBilinearTester.h | 5 +- .../tests/CBoxMullerTransformTester.h | 1 + .../tests/CConcentricMappingTester.h | 3 +- .../tests/CCumulativeProbabilityGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CLinearTester.h | 9 +- .../tests/CPolarMappingTester.h | 3 +- .../tests/CProjectedHemisphereTester.h | 7 +- .../tests/CProjectedSphereTester.h | 5 +- .../CProjectedSphericalRectangleTester.h | 81 ++- .../tests/CProjectedSphericalTriangleTester.h | 30 +- .../tests/CSphericalRectangleTester.h | 33 +- .../tests/CSphericalTriangleTester.h | 13 +- .../tests/CUniformHemisphereTester.h | 3 +- .../tests/CUniformSphereTester.h | 3 +- 
.../tests/SamplerTestHelpers.h | 482 ++++++++++++------ .../tests/property/CSamplerPropertyTester.h | 220 +++++--- 55 files changed, 1869 insertions(+), 784 deletions(-) create mode 100644 37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 2ac238c33..12cbb5bb1 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -113,8 +113,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", - \"KEY\": \"linear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"linear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", + \"KEY\": \"linear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", @@ -122,8 +127,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", - \"KEY\": \"uniform_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", + \"KEY\": \"uniform_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", @@ -131,8 +141,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", - \"KEY\": \"uniform_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", + \"KEY\": \"uniform_sphere_bench_1_16\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", @@ -140,8 +155,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", - \"KEY\": \"projected_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", + \"KEY\": \"projected_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", @@ -149,8 +169,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", - \"KEY\": \"projected_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", + \"KEY\": \"projected_sphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", @@ -158,8 +183,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", - \"KEY\": \"spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, 
\"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", @@ -167,8 +202,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", - \"KEY\": \"concentric_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"concentric_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", + \"KEY\": \"concentric_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", @@ -176,8 +216,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", - \"KEY\": \"polar_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"polar_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", + \"KEY\": \"polar_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", @@ -185,8 +230,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", - \"KEY\": \"bilinear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"bilinear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", + \"KEY\": \"bilinear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", @@ -194,8 +244,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", - \"KEY\": \"box_muller_transform_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": 
\"box_muller_transform_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", + \"KEY\": \"box_muller_transform_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", @@ -203,8 +258,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", @@ -212,8 +277,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_rectangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_create_only\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", @@ -221,8 +296,48 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_rectangle_bench_1_1_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": 
\"spherical_rectangle_bench_create_only_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", @@ -241,6 +356,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_yolo_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] } ] ") diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index da7048a1f..bb1ed54ef 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -27,6 +27,7 @@ struct AliasTableTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed alias table for weights {1, 2, 3, 4}: @@ -63,6 +64,7 @@ struct AliasTableTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl index 1f0a68195..5e679c98a 100644 --- a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl @@ -12,7 +12,6 @@ struct ArrayAccessor using value_type = T; template void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(data[i]); } - T operator[](uint32_t i) NBL_CONST_MEMBER_FUNC { return data[i]; } T data[N]; }; diff --git a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl index 64a13d3e1..752e547ce 100644 --- a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct BilinearTestResults float32_t forwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct BilinearTestExecutor @@ -37,6 +39,10 @@ struct BilinearTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); } + // marginFactor = 3: same reasoning as Linear; Bilinear is two Linear stages, so the skewed- + // coefficient inverse-CDF d^2/du^2 divergence near [0,1]^2 boundary applies on both axes. 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl index e8247e259..2b86e8560 100644 --- a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,6 +22,7 @@ struct BoxMullerTransformTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 separateBackwardPdf; + float32_t jacobianProduct; }; struct BoxMullerTransformTestExecutor @@ -40,6 +42,7 @@ struct BoxMullerTransformTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); output.separateBackwardPdf = sampler.separateBackwardPdf(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl index 67d8e5869..e0c6a570c 100644 --- a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ConcentricMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,15 @@ struct ConcentricMappingTestExecutor output.backwardWeight = sampling::ConcentricMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + { + sampling::ConcentricMapping 
sampler; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + // Disk-center singularity: concentric atan2 blows up as r->0. + const float32_t diskRadius = nbl::hlsl::length(output.mapped); + output.inverseJacobianPdf = diskRadius < 0.1f + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } } }; diff --git a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl index f58a22741..e66cb44fe 100644 --- a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl @@ -24,6 +24,7 @@ struct CumProbTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed CDF table for weights {1, 2, 3, 4}: @@ -46,6 +47,7 @@ struct CumProbTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index 9f1fec422..d5c1d313c 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -5,9 +5,6 @@ using namespace nbl::hlsl; -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; struct AliasTablePushConstants diff --git a/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl new file mode 100644 index 000000000..f949f5b86 --- 
/dev/null +++ b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl @@ -0,0 +1,264 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ + +#include +#include + +using namespace nbl::hlsl; + +// Negative sentinels signal "skipped" to the host verifier; the value encodes the reason. +static const float32_t JACOBIAN_SKIP_U_DOMAIN = -1.0f; +static const float32_t JACOBIAN_SKIP_CREASE = -2.0f; +static const float32_t JACOBIAN_SKIP_HEMI_BOUNDARY = -3.0f; +static const float32_t JACOBIAN_SKIP_BWD_PDF_RANGE = -4.0f; +static const float32_t JACOBIAN_SKIP_CODOMAIN_SINGULARITY = -5.0f; + + +template +struct ForwardJacobianMeasure; + +// Signed step that stays inside [0,1]: flip direction when u is in the upper half so u +/- eps +// never overshoots the domain. Magnitude is what matters (the stencil results take abs/length). +template +T signedEps(T u, T eps) +{ + return u > T(0.5) ? -eps : eps; +} + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + cache_type c; + const codomain_type L_x = _sampler.generate(u + signedEps(u, eps), c); + return nbl::hlsl::abs(L_x - L) / eps; + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + 
const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(L_x - L, L_y - L)); + return nbl::hlsl::abs(det) / (eps * eps); + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + return nbl::hlsl::length(nbl::hlsl::cross(L_x - L, L_y - L)) / (eps * eps); + } +}; + +// 3D domain: stencil perturbs u[0] and u[1] only, so the (2,3) body applies unchanged. +template +struct ForwardJacobianMeasure : ForwardJacobianMeasure +{ +}; + + +template +struct DomainMarginCheck; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u < margin || u > scalar_type(1) - margin; + } +}; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u[0] < margin || u[0] > scalar_type(1) - margin || u[1] < margin || u[1] > scalar_type(1) - margin; + } +}; + +// 3D domain: forward stencil only perturbs u[0] and u[1], so u[2] is irrelevant and (2) applies. 
+template +struct DomainMarginCheck : DomainMarginCheck +{ +}; + +enum JacobianMode : uint32_t +{ + JACOBIAN_PLAIN = 0, + JACOBIAN_CONCENTRIC = 1, // + concentric crease skip + JACOBIAN_CONCENTRIC_UXFOLD = 2 // + crease + u.x=0.5 hemi-boundary skip +}; + +// marginFactor scales the u-domain skip to marginFactor * eps. Use > 1 only for samplers whose +// stencil bias extends past a single eps-step (e.g. Arvo spherical triangle: sinZ ~ sqrt(u.y) +// gives O(h/u.y) forward-diff bias, so u.y in [0, k*eps] must be skipped). +template +float32_t computeJacobianProduct(Sampler _sampler, typename Sampler::domain_type u, float32_t eps, float32_t marginFactor) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + NBL_IF_CONSTEXPR(Mode != JACOBIAN_PLAIN) + { + // Cast via float32_t2 so this block typechecks for scalar / vec2 / vec3 domains alike + // (HLSL splats scalars, identity on vec2, .xy on vec3). 1D samplers never reach here. + const float32_t2 uxy = (float32_t2)u; + const float32_t ux = uxy.x; + const float32_t uy = uxy.y; + + NBL_IF_CONSTEXPR(Mode == JACOBIAN_CONCENTRIC_UXFOLD) + { + if (nbl::hlsl::abs(ux - float32_t(0.5)) <= float32_t(2e-3)) + return JACOBIAN_SKIP_HEMI_BOUNDARY; + } + + const bool uxFold = (Mode == JACOBIAN_CONCENTRIC_UXFOLD); + // Empirical: the concentric C0 crease's stencil bias spreads wider than the 2*eps geometric + // straddle band. Non-uxFold 6e-3 covers the disk-center residual for Projected samplers; + // uxFold 1e-2 accounts for the doubled local_ux rate when u.x is folded. + const float32_t creaseBand = uxFold ? float32_t(1e-2) : float32_t(6e-3); + const float32_t local_ux = uxFold ? 
nbl::hlsl::abs(float32_t(2) * ux - float32_t(1)) : ux; + const float32_t a = float32_t(2) * local_ux - float32_t(1); + const float32_t b = float32_t(2) * uy - float32_t(1); + if (nbl::hlsl::abs(nbl::hlsl::abs(a) - nbl::hlsl::abs(b)) <= creaseBand) + return JACOBIAN_SKIP_CREASE; + } + + using margin_check_type = DomainMarginCheck::Dimension>; + if (margin_check_type::outsideMargin(u, scalar_type(eps * marginFactor))) + return JACOBIAN_SKIP_U_DOMAIN; + + // Generate on a copy: some samplers mutate u through NBL_REF_ARG (e.g. ProjectedSphere + // consumes u.z for hemisphere selection), and the perturbations below need the original u. + cache_type cache; + domain_type uGen = u; + const codomain_type L = _sampler.generate(uGen, cache); + const scalar_type pdf = _sampler.forwardPdf(uGen, cache); + + using measure_type = ForwardJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type measure = measure_type::compute(_sampler, u, scalar_type(eps), L); + + return pdf * measure; +} + + +template +struct InverseJacobianMeasure; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type x0_lo = x; + x0_lo[0] -= eps; + codomain_type x0_hi = x; + x0_hi[0] += eps; + codomain_type x1_lo = x; + x1_lo[1] -= eps; + codomain_type x1_hi = x; + x1_hi[1] += eps; + domain_type u0_lo = _sampler.generateInverse(x0_lo); + domain_type u0_hi = _sampler.generateInverse(x0_hi); + domain_type u1_lo = _sampler.generateInverse(x1_lo); + domain_type u1_hi = _sampler.generateInverse(x1_hi); + const domain_type dudx0 = (u0_hi - u0_lo) / twoEps; + const domain_type dudx1 = (u1_hi - u1_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = 
nbl::hlsl::determinant(matrix2_type(dudx0, dudx1)); + return nbl::hlsl::abs(det); + } +}; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type t1, t2; + const codomain_type up = nbl::hlsl::abs(x[2]) < scalar_type(0.999) + ? codomain_type(scalar_type(0), scalar_type(0), scalar_type(1)) + : codomain_type(scalar_type(1), scalar_type(0), scalar_type(0)); + t1 = nbl::hlsl::normalize(nbl::hlsl::cross(up, x)); + t2 = nbl::hlsl::cross(x, t1); + domain_type u_t1_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t1 * eps)); + domain_type u_t1_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t1 * eps)); + domain_type u_t2_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t2 * eps)); + domain_type u_t2_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t2 * eps)); + const domain_type dudt1 = (u_t1_hi - u_t1_lo) / twoEps; + const domain_type dudt2 = (u_t2_hi - u_t2_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(dudt1, dudt2)); + return nbl::hlsl::abs(det); + } +}; + +template +float32_t computeInverseJacobianPdf(Sampler _sampler, typename Sampler::codomain_type sample, float32_t backwardPdf, float32_t pdfMin, float32_t pdfMax) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + if (backwardPdf < scalar_type(pdfMin) || backwardPdf > scalar_type(pdfMax)) + return JACOBIAN_SKIP_BWD_PDF_RANGE; + + using measure_type = InverseJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type eps = scalar_type(1e-3); + return measure_type::compute(_sampler, sample, eps); +} + 
+#endif diff --git a/37_HLSLSamplingTests/app_resources/common/linear.hlsl b/37_HLSLSamplingTests/app_resources/common/linear.hlsl index b27d88e5b..af269ad2f 100644 --- a/37_HLSLSamplingTests/app_resources/common/linear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/linear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct LinearTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct LinearTestExecutor @@ -37,6 +39,7 @@ struct LinearTestExecutor output.backwardPdf = _sampler.backwardPdf(output.generated); output.backwardWeight = _sampler.backwardWeight(output.generated); } + output.jacobianProduct = computeJacobianProduct(_sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl index 82e020fdc..e4b8ffabb 100644 --- a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct PolarMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,23 @@ struct PolarMappingTestExecutor output.backwardWeight = sampling::PolarMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + + { + sampling::PolarMapping sampler; + // marginFactor = 3: r = sqrt(u.x) gives O(h/u.x) forward-diff bias near u.x=0, so skip + // u.x within 3*eps of the domain boundary (same reasoning as Linear's skewed-density case). 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + // Two inverse singularities: + // - disk center: atan2 diverges as r -> 0 + // - atan2 branch cut at y=0, x>0: the stencil's +/-eps in y straddles the 2*pi wrap, + // producing du.y/eps ~ 1/eps spikes (seen as test values ~305-862 with eps=1e-3). + const float32_t polarRadius = nbl::hlsl::length(output.mapped); + const bool onCutBand = nbl::hlsl::abs(output.mapped.y) < 5e-3f && output.mapped.x > 0.0f; + output.inverseJacobianPdf = (polarRadius < 0.1f || onCutBand) + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl index 9697cf0df..c48697b03 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct ProjectedHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct ProjectedHemisphereTestExecutor @@ -43,7 +45,11 @@ struct ProjectedHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); + const float32_t phDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = phDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 1e-3f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl index e9886b61d..a78a937f6 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ProjectedSphereTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct ProjectedSphereTestExecutor @@ -38,6 +40,7 @@ struct ProjectedSphereTestExecutor } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl index 8370952ca..4aed7d9c3 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,12 +25,10 @@ struct ProjectedSphericalRectangleTestResults float32_t2 surfaceOffset; float32_t3 referenceDirection; float32_t forwardPdf; - float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; - float32_t backwardPdfAtGenerated; - float32_t backwardWeightAtGenerated; float32_t2 extents; + float32_t jacobianProduct; }; struct ProjectedSphericalRectangleTestExecutor @@ -46,30 +45,29 @@ struct ProjectedSphericalRectangleTestExecutor output.extents = rect.extents; 
sampling::ProjectedSphericalRectangle::cache_type cache; + output.generated = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + // backwardWeight now takes a 3D direction; evaluate at generated L. + output.backwardWeight = sampler.backwardWeight(output.generated); + + float32_t2 absXY; { - output.generated = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - } - { - sampling::ProjectedSphericalRectangle::cache_type offsetCache; - output.surfaceOffset = sampler.generateSurfaceOffset(input.u, offsetCache); + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = sampler.bilinearPatch.generate(input.u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + absXY = sampler.sphrect.generateLocalBasisXY(warped, sphrectCache); + output.surfaceOffset = absXY - float32_t2(sampler.sphrect.r0.x, sampler.sphrect.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.sphrect.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localPoint = float32_t3(absXY.x, absXY.y, sampler.sphrect.r0.z); + const float32_t3 localDir = nbl::hlsl::normalize(localPoint); + output.referenceDirection = sampler.sphrect.basis[0] * localDir[0] + + sampler.sphrect.basis[1] * localDir[1] + + sampler.sphrect.basis[2] * localDir[2]; } - // Test backwardPdf/Weight at the rect center: a deterministic interior point - // that avoids amplifying generate's FP errors through backward evaluation. 
- const float32_t2 center = float32_t2(0.5, 0.5); - output.backwardPdf = sampler.backwardPdf(center); - output.backwardWeight = sampler.backwardWeight(center); - // Use cache.warped (the [0,1]^2 input to the spherical rect warp) for consistency - // checks, NOT generated/extents (the nonlinear warp output). The bilinear in - // forwardPdf evaluates at cache.warped, so backwardPdf must too. - output.backwardPdfAtGenerated = sampler.backwardPdf(cache.warped); - output.backwardWeightAtGenerated = sampler.backwardWeight(cache.warped); + + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl index 5c81e53e0..0c424590b 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,11 +22,10 @@ struct ProjectedSphericalTriangleTestResults { float32_t3 generated; float32_t forwardPdf; - float32_t backwardPdf; - float32_t backwardPdfAtGenerated; float32_t forwardWeight; float32_t backwardWeight; float32_t backwardWeightAtGenerated; + float32_t jacobianProduct; }; struct ProjectedSphericalTriangleTestExecutor @@ -43,15 +43,20 @@ struct ProjectedSphericalTriangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } - // Test backwardPdf/Weight at the triangle centroid: a deterministic interior point computed - // from only basic arithmetic + sqrt (IEEE 754 exact), so CPU and GPU agree bit-exactly. - // Using output.generated would amplify generate's transcendental FP errors through - // generateInverse's acos, producing CPU/GPU divergence. 
const float32_t3 center = nbl::hlsl::normalize(input.vertex0 + input.vertex1 + input.vertex2); - output.backwardPdf = sampler.backwardPdf(center); output.backwardWeight = sampler.backwardWeight(center); - output.backwardPdfAtGenerated = sampler.backwardPdf(output.generated); output.backwardWeightAtGenerated = sampler.backwardWeight(output.generated); + // Check the bilinear-warped (inner) u directly: for skinny triangles with a strongly biased + // receiver normal, outer u well inside [0,1] can still warp to inner u <~ 0.02 where Arvo's + // sqrt(sinZ) noise dominates. Pre-skip on the inner u instead of padding an outer marginFactor. + sampling::Bilinear::cache_type bc; + const float32_t2 innerU = sampler.bilinearPatch.generate(input.u, bc); + const float32_t innerMargin = 0.02f; + const bool innerNearEdge = innerU.x < innerMargin || innerU.x > (1.0f - innerMargin) + || innerU.y < innerMargin || innerU.y > (1.0f - innerMargin); + output.jacobianProduct = innerNearEdge + ? JACOBIAN_SKIP_U_DOMAIN + : computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 9ae4df256..4f8d20964 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -26,6 +27,7 @@ struct SphericalRectangleTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 extents; + float32_t jacobianProduct; }; struct SphericalRectangleTestExecutor @@ -47,17 +49,23 @@ struct SphericalRectangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } + float32_t2 absXY; { sampling::SphericalRectangle::cache_type cache; - output.surfaceOffset = 
sampler.generateSurfaceOffset(input.u, cache); + absXY = sampler.generateLocalBasisXY(input.u, cache); + output.surfaceOffset = absXY - float32_t2(sampler.r0.x, sampler.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localDir = nbl::hlsl::normalize(float32_t3(absXY.x, absXY.y, sampler.r0.z)); + output.referenceDirection = sampler.basis[0] * localDir[0] + + sampler.basis[1] * localDir[1] + + sampler.basis[2] * localDir[2]; } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation + // for u.x within ~2*eps of 0 or 1 (au near n*pi), leaving ~0.5% residual at factor 3. + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 291661629..1828139d4 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,6 +25,7 @@ struct SphericalTriangleTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; // Minimum signed distance to a triangle edge (sin of angular distance to nearest great circle). // Positive = inside, negative = outside. Allows tolerance at boundaries. 
float32_t generatedInside; @@ -39,7 +41,7 @@ struct SphericalTriangleTestExecutor const float32_t3 verts[3] = { input.vertex0, input.vertex1, input.vertex2 }; shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); // Forward: u -> v { @@ -57,10 +59,8 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - - // Jacobian product: (1/forwardPdf) * backwardPdf should equal 1 for bijective samplers - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: // A point is inside the spherical triangle iff it is on the "inside" half-plane @@ -79,6 +79,13 @@ struct SphericalTriangleTestExecutor float32_t2 u = output.inverted; output.invertedInDomain = nbl::hlsl::min(nbl::hlsl::min(u.x, float32_t(1.0) - u.x), nbl::hlsl::min(u.y, float32_t(1.0) - u.y)); + + const float32_t uMargin = 1e-2f; + const bool nearUBoundary = output.inverted.x < uMargin || output.inverted.x > (1.0f - uMargin) + || output.inverted.y < uMargin || output.inverted.y > (1.0f - uMargin); + output.inverseJacobianPdf = nearUBoundary + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.1f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index 76a724774..fb51838c7 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformHemisphereTestExecutor @@ -42,7 +44,11 @@ struct UniformHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t uhDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = uhDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3780b82ef..3737f4575 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformSphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformSphereTestExecutor @@ -43,7 +45,12 @@ struct UniformSphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t usDiskR = nbl::hlsl::length((float32_t2)output.generated); + const float32_t absZ = nbl::hlsl::abs(output.generated.z); + output.inverseJacobianPdf = (absZ < 0.1f || usDiskR < 0.1f) + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl index 72c4f1977..67047f997 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl @@ -58,18 +58,15 @@ void main() float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; uint32_t acc = 0u; - uint32_t accPdf = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated; - accPdf ^= asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc + accPdf); + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); #else AliasTableTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 06aad4fdc..03ac7b36a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 
+24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; - sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::Bilinear::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; + sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::Bilinear::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index cf0f4065a..6189d4658 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb stddev by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - u.x = max(u.x, 1e-7f); - sampling::BoxMullerTransform::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + u.x = max(u.x, 1e-7f); + sampling::BoxMullerTransform::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git 
a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 973aba4fe..649c323b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ConcentricMapping::cache_type cache; - float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ConcentricMapping::cache_type cache; + float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 2e48adc4a..1091ee447 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -46,10 +46,10 @@ void main() for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchCumProbSampler::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 614f339b4..17cf83ac5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; - sampling::Linear sampler = sampling::Linear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t u = float32_t(rng()) * toFloat; - sampling::Linear::cache_type cache; - float32_t generated = sampler.generate(u, cache); - acc ^= asuint(generated); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; + sampling::Linear sampler = sampling::Linear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t u = float32_t(rng()) * toFloat; + sampling::Linear::cache_type cache; + float32_t generated = sampler.generate(u, cache); + acc ^= asuint(generated); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index db7488acd..e0cf7aea0 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() 
nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::PolarMapping::cache_type cache; - float32_t2 generated = sampling::PolarMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::PolarMapping::cache_type cache; + float32_t2 generated = sampling::PolarMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index 871444955..d1ef313e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / 
uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::ProjectedHemisphere sampler; - sampling::ProjectedHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 67a3fa662..9b8c234c4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; sampling::ProjectedSphere sampler; - sampling::ProjectedSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, 
cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; + sampling::ProjectedSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index 903075804..ca9b4d43e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1, 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -21,25 +27,49 @@ main() const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS // Perturb rectangle origin by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + // Read a cheap function of sampler state so create() can't be elided. 
+ sampling::ProjectedSphericalRectangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + // Unified create:generate loop — one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 83e47b3e1..3d8ec8961 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && 
defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,23 +24,40 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices and normal by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + 
sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 3e9a6fcae..b9766d5ff 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1 (create+generate per iter), 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. 
+#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,26 +26,96 @@ main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb rectangle origin by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::SphericalRectangle sampler = sampling::SphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f)); + // Observer at origin so origin - observer = (p, p, -2) has no zero components: + // keeps all 4 denorm_n_z components perturbation-dependent (no constant-folding). + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; + +#if (defined(BENCH_VARIANT_SA_EXTENTS) || defined(BENCH_VARIANT_R0_EXTENTS)) && !defined(BENCH_CREATE_ONLY) + // variants 2/3 pre-build: produce a rect (for its basis, sa, extents) once per thread. 
+ shapes::CompressedSphericalRectangle compressedBase; + compressedBase.origin = float32_t3(perturbationBase, perturbationBase, -2.0f); + compressedBase.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedBase.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectBase = shapes::SphericalRectangle::create(compressedBase); + const typename shapes::SphericalRectangle::solid_angle_type saBase = rectBase.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + const float32_t2 extentsBase = rectBase.extents; + const matrix basisBase = rectBase.basis; +#endif nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + typename shapes::SphericalRectangle::solid_angle_type sa = rect.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + sampler = sampling::SphericalRectangle::create(rect.basis, sa, rect.extents); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // Build a basis from the same rect geometry so create(basis, r0, extents) has the right frame. 
+ shapes::CompressedSphericalRectangle compressedR0; + compressedR0.origin = float32_t3(perturbation, perturbation, -2.0f); + compressedR0.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedR0.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectR0 = shapes::SphericalRectangle::create(compressedR0); + const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(rectR0.basis, r0, extents); + #else + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + // Read a cheap function of sampler state so create() can't be elided. + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); } +#else + // Unified create:generate loop - one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + // variant 2: create(basis, sa, extents). Poison one cosGamma so the sincos_accumulator can't be hoisted. + typename shapes::SphericalRectangle::solid_angle_type sa = saBase; + sa.cosGamma[2] += perturbation; + sampler = sampling::SphericalRectangle::create(basisBase, sa, extentsBase); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // variant 3: create(basis, r0, extents). r0 matches what variant 1 produces. 
+ const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(basisBase, r0, extents); + #else + // variant 1 (default): create(shape, observer). + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index 55991bcb3..3595ac86a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -11,32 +11,50 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = 
shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index 908520243..cd43c630e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -1,4 +1,8 @@ +#pragma shader_stage(compute) + // Compile test: instantiate all sampling types and their concept-required methods to verify DXC compilation +#include +#include #include #include #include @@ -9,12 +13,15 @@ #include #include #include +#include +#include +#include +#include "../common/array_accessor.hlsl" using namespace nbl::hlsl; [[vk::binding(0, 0)]] RWStructuredBuffer output; [numthreads(1, 1, 1)] -[shader("compute")] void main() { float32_t2 u2 = float32_t2(0.5, 0.5); @@ -119,7 +126,7 @@ void main() // Octant triangle: all dot products between vertices are 0, so cos_sides=0, csc_sides=1 const float32_t3 triVerts[3] = {float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)}; shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::createFromUnitSphereVertices(triVerts); - sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); sampling::SphericalTriangle::cache_type sphTriCache; float32_t3 stSample = sphTri.generate(u2, 
sphTriCache); acc.xyz += stSample; @@ -129,7 +136,7 @@ void main() acc.x += sphTri.backwardPdf(stSample); acc.x += sphTri.backwardWeight(stSample); - // SphericalRectangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // SphericalRectangle — generate, generateSurfaceOffset, forwardPdf, backwardPdf, forwardWeight, backwardWeight shapes::CompressedSphericalRectangle csr; csr.origin = float32_t3(0.0, 0.0, -1.0); csr.right = float32_t3(1.0, 0.0, 0.0); @@ -140,20 +147,81 @@ void main() sampling::SphericalRectangle::cache_type sphRectCache; float32_t3 srSample = sphRect.generate(u2, sphRectCache); acc.xyz += srSample; + acc.xy += sphRect.generateLocalBasisXY(u2, sphRectCache); acc.x += sphRect.forwardPdf(u2, sphRectCache); acc.x += sphRect.forwardWeight(u2, sphRectCache); acc.x += sphRect.backwardPdf(srSample); acc.x += sphRect.backwardWeight(srSample); - // ProjectedSphericalTriangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // ProjectedSphericalTriangle — generate, forwardPdf, forwardWeight, backwardWeight(L) sampling::ProjectedSphericalTriangle projTri = sampling::ProjectedSphericalTriangle::create(shapeTri, float32_t3(0.0, 0.0, 1.0), false); sampling::ProjectedSphericalTriangle::cache_type projTriCache; float32_t3 ptSample = projTri.generate(u2, projTriCache); acc.xyz += ptSample; acc.x += projTri.forwardPdf(u2, projTriCache); acc.x += projTri.forwardWeight(u2, projTriCache); - acc.x += projTri.backwardPdf(ptSample); acc.x += projTri.backwardWeight(ptSample); + // ProjectedSphericalRectangle (UsePdfAsWeight=true) — generate, forwardPdf, forwardWeight, backwardWeight(L) + const float32_t3 psrNormal = float32_t3(0.0, 0.0, 1.0); + sampling::ProjectedSphericalRectangle projRectPdf = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, false); + sampling::ProjectedSphericalRectangle::cache_type projRectPdfCache; + float32_t3 prPdfSample = projRectPdf.generate(u2, projRectPdfCache); + acc.xyz += 
prPdfSample; + acc.x += projRectPdf.forwardPdf(u2, projRectPdfCache); + acc.x += projRectPdf.forwardWeight(u2, projRectPdfCache); + acc.x += projRectPdf.backwardWeight(prPdfSample); + + // ProjectedSphericalRectangle (UsePdfAsWeight=false) — exercise the MIS-weight path + sampling::ProjectedSphericalRectangle projRectMis = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, true); + sampling::ProjectedSphericalRectangle::cache_type projRectMisCache; + float32_t3 prMisSample = projRectMis.generate(u2, projRectMisCache); + acc.xyz += prMisSample; + acc.x += projRectMis.forwardPdf(u2, projRectMisCache); + acc.x += projRectMis.forwardWeight(u2, projRectMisCache); + acc.x += projRectMis.backwardWeight(prMisSample); + + // AliasTable — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor aliasProb; + aliasProb.data[0] = 0.25; aliasProb.data[1] = 0.5; aliasProb.data[2] = 0.75; aliasProb.data[3] = 1.0; + ArrayAccessor aliasIdx; + aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; + ArrayAccessor aliasPdf; + aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; + sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; + uint32_t aliasBin0 = aliasTable.generate(0.3); + uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); + acc.x += float32_t(aliasBin0 + aliasBin); + acc.x += aliasTable.forwardPdf(0.3, aliasCache); + acc.x += aliasTable.forwardWeight(0.3, aliasCache); + acc.x += aliasTable.backwardPdf(aliasBin); + acc.x += aliasTable.backwardWeight(aliasBin); + + // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor cumProb; + cumProb.data[0] 
= 0.25; cumProb.data[1] = 0.5; cumProb.data[2] = 0.75; + sampling::CumulativeProbabilitySampler > cumSampler = + sampling::CumulativeProbabilitySampler >::create(cumProb, 4u); + sampling::CumulativeProbabilitySampler >::cache_type cumCache; + uint32_t cumBin0 = cumSampler.generate(0.6); + uint32_t cumBin = cumSampler.generate(0.6, cumCache); + acc.x += float32_t(cumBin0 + cumBin); + acc.x += cumSampler.forwardPdf(0.6, cumCache); + acc.x += cumSampler.forwardWeight(0.6, cumCache); + acc.x += cumSampler.backwardPdf(cumBin); + acc.x += cumSampler.backwardWeight(cumBin); + + // PartitionRandVariable — operator() partitions u into a left/right branch + sampling::PartitionRandVariable partition; + partition.leftProb = 0.25; + float32_t partXi = 0.5; + float32_t partRcp; + bool partRight = partition(partXi, partRcp); + acc.x += partXi + partRcp + float32_t(partRight ? 1 : 0); + output[0] = acc; } diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index d0990ef43..3c43ee119 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; 
sampling::UniformHemisphere sampler; - sampling::UniformHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 0d33f5c11..5879e28bb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::UniformSphere sampler; - sampling::UniformSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < 
uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 8f85545b3..02fbf58d2 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -12,8 +12,11 @@ using namespace nbl; // Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds both tables from the same weight distribution, uploads via BDA buffers, -// and measures GPU throughput using timestamp queries. +// Builds pipelines once, then sweeps a list of table sizes. For each N it builds +// both tables from the same weight distribution, uploads via BDA buffers, and +// measures GPU throughput using timestamp queries. The cumulative probability +// sampler is run in two variants: the stateful-comparator cache population +// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: @@ -26,17 +29,17 @@ class CDiscreteSamplerBenchmark video::IPhysicalDevice* physicalDevice; std::string aliasShaderKey; std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; uint32_t computeFamilyIndex; uint32_t dispatchGroupCount; - uint32_t tableSize; }; void setup(const SetupData& data) { m_device = data.device; m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_tableSize = data.tableSize; m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); @@ -44,8 +47,6 @@ class CDiscreteSamplerBenchmark // Command pool + buffers m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf); // Timestamp query pool { @@ -56,61 +57,9 @@ class CDiscreteSamplerBenchmark m_queryPool = m_device->createQueryPool(qp); } - // Generate random weights - const uint32_t N = m_tableSize; - std::vector weights(N); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < N; i++) - weights[i] = dist(rng); - - // Build alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); - - // Build cumulative probability table - std::vector cumProb(N - 1); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - // Create BDA buffers and upload data - auto createBdaBuffer = [&](const void* 
srcData, size_t bytes) -> core::smart_refctd_ptr - { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - // Flush so GPU can see the written data - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } - return buf; - }; - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Alias table buffers - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - - // CDF buffer - m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1) * sizeof(float)); - - // Shared output buffer + // Shared output buffer (size only depends on thread count) { video::IGPUBuffer::SCreationParams bp = {}; bp.size = totalThreads * sizeof(uint32_t); @@ -122,163 +71,218 @@ class CDiscreteSamplerBenchmark m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } - // Create pipelines (push constants only, no descriptor sets) - auto loadShader = [&](const std::string& key) + // Pipelines (N-independent; only push constants change per run) + m_aliasPipeline = 
createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + } + + // DispatchScheduler: uint32_t N -> std::pair. + // Lets the caller trade wall-clock for statistical stability per size: + // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. + struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + + // Sweep a list of table sizes. For each N: build tables from a fresh weight + // distribution (deterministic seed = 42 + N so different N's get distinct + // distributions but runs are reproducible), upload via BDA, then run all + // three samplers with the dispatch counts chosen by `scheduler`. + template + void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) + { + const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; + m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", + system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", + system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + + for (uint32_t N : tableSizes) + { + const DispatchCounts dc = scheduler(N); + buildAndUpload(N); + runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + releaseTables(); + } + } + + // Convenience: sweep with fixed dispatch counts for every size. 
+ void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + { + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { + return {warmupIterations, benchmarkIterations}; + }); + } + + private: + enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + + template + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + { + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; + auto layout = m_device->createPipelineLayout({&pcRange, 1}); + if (!layout) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); + + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = asset::IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); + if (!shader) + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + + video::IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(key, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - return m_device->compileShader({.source = source.get()}); - }; - - // Alias table pipeline + pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + + 
core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(AliasTablePushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.aliasShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load alias shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_aliasPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias compute pipeline", system::ILogger::ELL_ERROR); - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_aliasPipeline->getExecutableInfo()); - m_logger->log("Alias Table Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_aliasPplnLayout = std::move(layout); + auto report = system::to_string(pipeline->getExecutableInfo()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); } + outLayout = std::move(layout); + return pipeline; + } - // CDF pipeline + core::smart_refctd_ptr 
createBdaBuffer(const void* srcData, size_t bytes) + { + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto buf = m_device->createBuffer(std::move(bp)); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + + const auto allocSize = alloc.memory->getAllocationSize(); + if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(CumProbPushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.cumProbShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load cumprob shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_cumProbPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob compute pipeline", system::ILogger::ELL_ERROR); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_cumProbPipeline->getExecutableInfo()); - 
m_logger->log("Cumulative Probability Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_cumProbPplnLayout = std::move(layout); + std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); + video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); + m_device->flushMappedMemoryRanges(1u, &flushRange); + alloc.memory->unmap(); } + return buf; } - void run(uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void buildAndUpload(uint32_t N) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - const uint32_t totalThreads = m_dispatchGroupCount * benchWorkgroupSize; - m_logger->log("=== GPU Discrete Sampler Benchmark (N=%u, %u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - system::ILogger::ELL_PERFORMANCE, m_tableSize, benchmarkIterations, totalThreads, BENCH_ITERS); + m_currentN = N; + + std::vector weights(N); + std::mt19937 rng(42u + N); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < N; i++) + weights[i] = dist(rng); + + // Alias table + std::vector aliasProb(N); + std::vector aliasIdx(N); + std::vector aliasPdf(N); + std::vector workspace(N); + nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + + // Cumulative probability (N-1 entries, last bucket implicitly 1.0) + std::vector cumProb(N > 0 ? 
N - 1 : 0); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - runSingle("AliasTable", m_aliasPipeline, m_aliasPplnLayout, true, warmupIterations, benchmarkIterations); - runSingle("CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, false, warmupIterations, benchmarkIterations); + m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); + m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); + const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); + m_cumProbBuf = cumProbBytes ? createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; } - private: - void runSingle(const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, bool isAlias, uint32_t warmupIterations, uint32_t benchmarkIterations) + void releaseTables() + { + m_aliasProbBuf = nullptr; + m_aliasIdxBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_cumProbBuf = nullptr; + } + + void runSingle( + uint32_t N, + const char* name, + const core::smart_refctd_ptr& pipeline, + const core::smart_refctd_ptr& layout, + SamplerKind kind, + uint32_t warmupIterations, + uint32_t benchmarkIterations) { m_device->waitIdle(); - // Record benchmark command buffer + // Everything (warmup, timestamped bench, cooldown) goes into ONE cmdbuf and ONE + // submit. Serial submissions with semaphore waits between them would add sync cost + // to every dispatch and prevent the driver from overlapping adjacent dispatches. + // With a single cmdbuf the driver pipelines freely, and GPU memory latency is + // hidden by warp hyperthreading rather than by cross-submit overlap. + // + // Layout: [warmup dispatches] [ts 0] [bench dispatches] [ts 1] [cooldown dispatches] + // Warmup brings clocks + caches to steady state before ts 0. 
Cooldown keeps the + // same steady-state context alive across ts 1 so the trailing bench dispatches + // don't measure a tail where the GPU is already winding down. + const uint32_t cooldownIterations = warmupIterations; + m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (isAlias) + if (kind == SamplerKind::Alias) { AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); + pc.probAddress = m_aliasProbBuf->getDeviceAddress(); pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.cumProbAddress = m_cumProbBuf ? 
m_cumProbBuf->getDeviceAddress() : 0ull; + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + for (uint32_t i = 0u; i < warmupIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < benchmarkIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - // Record timestamp command buffers - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = {{.cmdbuf = m_timestampBeforeCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = {{.cmdbuf = m_timestampAfterCmdbuf.get()}}; - - auto submitSerial = [&](const 
video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; - - for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -288,36 +292,37 @@ class CDiscreteSamplerBenchmark m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); constexpr uint32_t benchIters = BENCH_ITERS; - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(benchWorkgroupSize); + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); const float64_t ps_per_sample = elapsed_ns * 
1e3 / float64_t(totalSamples); const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", system::ILogger::ELL_PERFORMANCE, name, ps_per_sample, gsamples_per_s, elapsed_ms); + m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", + system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; - // Alias table + // Pipelines (set up once) core::smart_refctd_ptr m_aliasPplnLayout; core::smart_refctd_ptr m_aliasPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + + // Per-N data buffers (rebuilt each sweep step) core::smart_refctd_ptr m_aliasProbBuf; core::smart_refctd_ptr m_aliasIdxBuf; core::smart_refctd_ptr m_aliasPdfBuf; - - // Cumulative probability - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; core::smart_refctd_ptr m_cumProbBuf; // Shared @@ -325,7 +330,7 @@ class CDiscreteSamplerBenchmark video::IQueue* m_queue = nullptr; video::IPhysicalDevice* m_physicalDevice = nullptr; uint32_t m_dispatchGroupCount = 0; - uint32_t m_tableSize = 0; + uint32_t m_currentN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 3e2092670..9f9854ac5 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -162,7 +162,7 @@ class 
CSamplerBenchmark } // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. - void run(const std::string& samplerName, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); recordBenchmarkCmdBuf(); @@ -213,9 +213,9 @@ class CSamplerBenchmark const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", + m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f", system::ILogger::ELL_PERFORMANCE, - samplerName.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); + samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); } private: diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 98ea127cc..470132aba 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -51,12 +51,11 @@ using namespace nbl::examples; #include "benchmarks/CDiscreteSamplerBenchmark.h" #include "tests/property/CSamplerPropertyTester.h" -constexpr bool DoBenchmark = true; class HLSLSamplingTests final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using asset_base_t = BuiltinResourcesApplication; public: HLSLSamplingTests(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -64,7 +63,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override { - auto retval = device_base_t::getPreferredDeviceFeatures(); + auto retval 
= device_base_t::getPreferredDeviceFeatures(); retval.pipelineExecutableInfo = true; return retval; } @@ -80,10 +79,10 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // test compile with dxc { IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); - auto bundle = m_assetMgr->getAsset(key.c_str(), lp); + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); + auto bundle = m_assetMgr->getAsset(key.c_str(), lp); const auto assets = bundle.getContents(); if (assets.empty()) @@ -155,8 +154,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); @@ -166,7 +165,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); - static_assert(sampling::concepts::BijectiveSampler>); + static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); 
static_assert(sampling::concepts::BijectiveSampler>); @@ -180,89 +179,162 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - const uint32_t testBatchCount = 1024; + // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough + // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is + // hidden by hyperthreading rather than by cross-dispatch overlap. + constexpr uint32_t testBatchCount = 4096; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; + constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize; - constexpr uint32_t iterationsPerThread = BENCH_ITERS; + constexpr uint32_t iterationsPerThread = BENCH_ITERS; constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; struct BenchEntry { CSamplerBenchmark bench; - std::string name; + std::string sampler; + std::string mode; }; std::vector benchmarks; - auto addBench = [&](const char* name, const std::string& shaderKey, size_t inputSize, size_t outputSize) + auto addBench = [&](const char* sampler, const char* mode, const std::string& shaderKey, size_t inputSize, size_t outputSize) { - auto& entry = benchmarks.emplace_back(); - entry.name = name; + auto& entry = benchmarks.emplace_back(); + entry.sampler = sampler; + entry.mode = mode; CSamplerBenchmark::SetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = 
getComputeQueue()->getFamilyIndex(); - data.shaderKey = shaderKey; + data.shaderKey = shaderKey; data.dispatchGroupCount = testBatchCount; data.samplesPerDispatch = benchSamplesPerDispatch; - data.inputBufferBytes = inputSize; - data.outputBufferBytes = outputSize; + data.inputBufferBytes = inputSize; + data.outputBufferBytes = outputSize; entry.bench.setup(data); }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - addBench("Linear", nbl::this_example::builtin::build::get_spirv_key<"linear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + 
//addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); // Print all pipeline reports first 
for (auto& entry : benchmarks) - entry.bench.logPipelineReport(entry.name); + entry.bench.logPipelineReport(entry.sampler + " (" + entry.mode + ")"); // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; - dsData.tableSize = 1024; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); // Then run all benchmarks here so the reports are at the top of the log, followed by timings - constexpr uint32_t warmupDispatches = 500; - constexpr uint32_t benchDispatches = 5000; - m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); - for (auto& entry : benchmarks) - entry.bench.run(entry.name, 
warmupDispatches, benchDispatches); - - discreteBench.run(warmupDispatches, benchDispatches); + { + constexpr uint32_t warmupDispatches = 300; + constexpr uint32_t benchDispatches = 1000; + m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", + ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); + m_logger->log(" %-28s | %-38s | %12s | %12s | %12s", + ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); + for (auto& entry : benchmarks) + entry.bench.run(entry.sampler, entry.mode, warmupDispatches, benchDispatches); + } + + { + // Sweep covers both the YOLO-vs-Comparator comparison (explicit points at + // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from + // N=4 up to 32M in a roughly-power-of-8 progression. + const std::vector discreteSizes = { + 4u, + 16u, + 32u, + 100u, + 128u, + 512u, + 8192u, + 10000u, + 131072u, + 1000000u, + 2097152u, + 16777216u, + 33554432u, + }; + + // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock + // per sampler-per-N stays near 1.5 s. Cost model comes from the prior + // sweep (order-of-magnitude ps/sample vs N). 
+ auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts + { + double ps_per_sample; + if (N < 1000u) ps_per_sample = 15.0; // L1-resident + else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 + else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge + else ps_per_sample = 8000.0; // DRAM-bound + + constexpr double targetNs = 1.5e9; // ~1.5 s per bench + constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); + const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); + const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); + const uint32_t warmup = std::max(20u, bench / 10u); + return {warmup, bench}; + }; + + discreteBench.runSweep(discreteSizes, dispatchScheduler); + } } } @@ -270,21 +342,20 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Runtime CPU/GPU comparison tests using ITester harness // ================================================================ bool pass = true; - const uint32_t workgroupSize = WORKGROUP_SIZE; // generic lambda to run a GPU sampler test auto runSamplerTest = [&](const char* testName, auto spirvKey, const char* logFile) { m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, workgroupSize); + data.shaderKey = spirvKey; + Tester tester(testBatchCount, WORKGROUP_SIZE); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -307,7 +378,7 @@ class HLSLSamplingTests 
final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt"); } - if constexpr (true) + if constexpr (DoBenchmark) { // --- Discrete table construction (CPU) --- { @@ -320,6 +391,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } + logJacobianSkipCounts(m_logger.get()); if (pass) m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO); else diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 87aac65ba..32f0e3b28 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -52,6 +52,7 @@ class CAliasTableGPUTester final : public ITesterlog(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; @@ -140,7 +141,7 @@ struct LinearStressConfig { using nbl::system::to_string; logger->log(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CPolarMappingTester.h b/37_HLSLSamplingTests/tests/CPolarMappingTester.h index f7009176b..6c43f8877 100644 --- 
a/37_HLSLSamplingTests/tests/CPolarMappingTester.h +++ b/37_HLSLSamplingTests/tests/CPolarMappingTester.h @@ -46,7 +46,8 @@ class CPolarMappingTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); - ProjectedSphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. - input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); - - // Build shape to use centralized corner check nbl::hlsl::shapes::CompressedSphericalRectangle compressed; - compressed.origin = input.rectOrigin; - compressed.right = input.right; - compressed.up = input.up; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + + ProjectedSphericalRectangleInputValues input; + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; + auto shape = nbl::hlsl::shapes::SphericalRectangle::create(compressed); // Ensure the receiver normal has positive projection onto at least one vertex, @@ -63,25 +58,25 @@ class CProjectedSphericalRectangleTester final : public ITester actual.extents.x || - actual.surfaceOffset.y < 0.0f || actual.surfaceOffset.y > actual.extents.y) + PdfCheck {"ProjectedSphericalRectangle::forwardPdf", &R::forwardPdf}); + VERIFY_JACOBIAN_OR_SKIP(pass, "ProjectedSphericalRectangle::jacobianProduct", 1.0f, actual.jacobianProduct, iteration, seed, testType, 5e-2, 5e-2); + + constexpr float boundsEps = 1e-5f; + if (actual.surfaceOffset.x < -boundsEps || actual.surfaceOffset.x > actual.extents.x + boundsEps || + actual.surfaceOffset.y < -boundsEps || actual.surfaceOffset.y > actual.extents.y 
+ boundsEps) { pass = false; - printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, 0.0); + printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, boundsEps); } // generate must be unit length @@ -90,7 +85,7 @@ class CProjectedSphericalRectangleTester final : public ITester createProjectedRectSampler( +inline nbl::hlsl::sampling::ProjectedSphericalRectangle createProjectedRectSampler( std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer, @@ -121,15 +116,16 @@ inline nbl::hlsl::sampling::ProjectedSphericalRectangle cr outNormal = generateRandomUnitVector(rng); } while (!anyRectCornerAboveHorizon(shape, observer, outNormal)); - return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); + return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); } struct ProjectedSphericalRectanglePropertyConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; - static constexpr uint32_t samplesPerConfig = 20000; + static constexpr uint32_t samplesPerConfig = 50000; static constexpr bool hasMCNormalization = true; static constexpr bool hasGridIntegration = false; static constexpr float64_t mcNormalizationRelTol = 0.08; @@ -155,23 +151,20 @@ struct ProjectedSphericalRectanglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; struct ProjectedSphericalRectangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -202,17 +195,13 @@ struct ProjectedSphericalRectangleGrazingConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h index 31f85ba02..0460a30ee 100644 --- a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h +++ b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h @@ -60,17 +60,19 @@ class CProjectedSphericalTriangleTester final : public ITester; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -117,18 +120,19 @@ struct ProjectedSphericalTrianglePropertyConfig // E[1/pdf] = solidAngle * E[1/bilinearPdf] = solidAngle * 1.0 = solidAngle static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; struct ProjectedSphericalTriangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -169,12 +173,12 @@ struct ProjectedSphericalTriangleGrazingConfig static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index 2a6030b78..fa5c93ccb 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -20,17 +20,17 @@ class CSphericalRectangleTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); + nbl::hlsl::shapes::CompressedSphericalRectangle compressed; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + SphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. 
- input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; input.u = nbl::hlsl::float32_t2(uDist(getRandomEngine()), uDist(getRandomEngine())); m_inputs.push_back(input); return input; @@ -48,16 +48,21 @@ class CSphericalRectangleTester final : public ITester; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -121,7 +124,7 @@ struct SphericalTrianglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; @@ -130,7 +133,7 @@ struct SphericalTrianglePropertyConfig // These stress the C_s great-circle intersection and v-recovery in generateInverse. 
struct SphericalTriangleStressConfig { - using sampler_type = nbl::hlsl::sampling::SphericalTriangle; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -218,7 +221,7 @@ struct SphericalTriangleStressConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index 29994511f..4f2ae08a4 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -45,7 +45,8 @@ class CUniformHemisphereTester final : public ITester& jacobianStats() +{ + static nbl::core::map s; + return s; +} +} // namespace detail + +inline void logJacobianSkipCounts(nbl::system::ILogger* logger) +{ + auto& stats = detail::jacobianStats(); + if (stats.empty()) + return; + logger->log("Jacobian skip summary (skipped samples are NOT counted as passes):", nbl::system::ILogger::ELL_INFO); + for (const auto& [name, s] : stats) + { + const uint64_t skipped = s.skipUDomain + s.skipCrease + s.skipHemiBoundary + s.skipBwdPdfRange + s.skipCodomainSingularity; + if (skipped == 0) + continue; + const double percentage = s.total ? 
(100.0 * double(skipped) / double(s.total)) : 0.0; + logger->log(" [JacobianSkip] %s: %llu / %llu skipped (%.2f%%) -- u-domain=%llu, crease=%llu, hemi-boundary=%llu, bwd-pdf-range=%llu, codomain-singularity=%llu", + nbl::system::ILogger::ELL_WARNING, + name.c_str(), + skipped, + s.total, + percentage, + s.skipUDomain, + s.skipCrease, + s.skipHemiBoundary, + s.skipBwdPdfRange, + s.skipCodomainSingularity); + } +} + +// Verify a jacobianProduct value OR bin it by reason if it is a skip sentinel (< 0). +// Skipped samples are counted by reason and NEVER counted as a pass. +// Must be called from a method that has access to verifyTestValue. +#define VERIFY_JACOBIAN_OR_SKIP(pass, name, expected, actual, iteration, seed, testType, relTol, absTol) \ + do \ + { \ + auto& _jstats = detail::jacobianStats()[(name)]; \ + ++_jstats.total; \ + const float _jval = (actual); \ + if (_jval < 0.0f) \ + { \ + /* Sentinel values are integers at -1..-5, so round-to-nearest on _jval picks the bin. */ \ + const int _bin = static_cast(-_jval + 0.5f); \ + switch (_bin) \ + { \ + case 1: \ + ++_jstats.skipUDomain; \ + break; \ + case 2: \ + ++_jstats.skipCrease; \ + break; \ + case 3: \ + ++_jstats.skipHemiBoundary; \ + break; \ + case 4: \ + ++_jstats.skipBwdPdfRange; \ + break; \ + case 5: \ + ++_jstats.skipCodomainSingularity; \ + break; \ + default: \ + ++_jstats.skipUDomain; \ + break; /* fall-through bucket */ \ + } \ + } \ + else \ + { \ + pass &= verifyTestValue((name), (expected), _jval, (iteration), (seed), (testType), (relTol), (absTol)); \ + } \ + } while (0) + // Check that each PDF field is positive and finite. // Must be called from within a method that has access to printTestFail. -#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) \ - do \ - { \ - auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ - std::apply([&](const auto&... c) { (([&] { \ +#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) 
\ + do \ + { \ + auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ + std::apply([&](const auto&... c) { (([&] { \ if (!((actual).*c.field > 0.0f) || !std::isfinite((actual).*c.field)) \ - { \ - pass = false; \ - printTestFail(std::string(c.name) + " (positive & finite)", \ - 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ - } \ - }()), \ - ...); }, _pdfChecks); \ + { \ + pass = false; \ + printTestFail(std::string(c.name) + " (positive & finite)", \ + 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ + } \ + }()), \ + ...); }, _pdfChecks); \ } while (0) // ============================================================================ @@ -139,7 +235,7 @@ inline float64_t gridIntegratePdf1D(const auto& sampler, uint32_t N = 100000) // 2D grid integration of backwardPdf over [0,1]^2 inline float64_t gridIntegratePdf2D(const auto& sampler, uint32_t N = 1000) { - float64_t sum = 0.0; + float64_t sum = 0.0; const float64_t cellArea = 1.0 / static_cast(N * N); for (uint32_t iy = 0; iy < N; iy++) { @@ -190,17 +286,15 @@ inline void buildTangentFrame(nbl::hlsl::float32_t3 dir, nbl::hlsl::float32_t3& // Generate a small equilateral triangle on the unit sphere around baseDir with given half-angle. // Also generates a random normal with decent projection onto the triangle. 
-inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, - nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, - nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) +inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; baseDir = generateRandomUnitVector(rng); float32_t3 t1, t2; buildTangentFrame(baseDir, t1, t2); - v0 = normalize(baseDir + t1 * halfAngle); - v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); - v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); + v0 = normalize(baseDir + t1 * halfAngle); + v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); + v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); normal = generateRandomUnitVector(rng); if (dot(normal, baseDir) < 0.1f) normal = normalize(normal + baseDir * 2.0f); @@ -221,10 +315,10 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float spread = 0.15f + angleDist(rng) * 0.2f; - v0 = normalize(base + t1 * spread); - v1 = normalize(base - t1 * spread); - float far_ = 0.8f + angleDist(rng) * 0.8f; - v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); + v0 = normalize(base + t1 * spread); + v1 = normalize(base - t1 * spread); + float far_ = 0.8f + angleDist(rng) * 0.8f; + v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); break; } case 1: // Nearly coplanar @@ -233,12 +327,12 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(pole, t1, t2); float offset = 0.05f + angleDist(rng) * 0.1f; - float a1 = angleDist(rng) * 6.2832f; - float a2 = a1 + 0.8f + angleDist(rng); - float a3 = a2 + 0.8f + 
angleDist(rng); - v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); - v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); - v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); + float a1 = angleDist(rng) * 6.2832f; + float a2 = a1 + 0.8f + angleDist(rng); + float a3 = a2 + 0.8f + angleDist(rng); + v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); + v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); + v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); break; } default: // One short edge @@ -247,9 +341,9 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float shortAngle = 0.32f + angleDist(rng) * 0.1f; - v0 = normalize(base + t1 * shortAngle * 0.5f); - v1 = normalize(base - t1 * shortAngle * 0.5f); - v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); + v0 = normalize(base + t1 * shortAngle * 0.5f); + v1 = normalize(base - t1 * shortAngle * 0.5f); + v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); break; } } @@ -262,65 +356,114 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 inline void makeEquilateralTriangle(float64_t theta, nbl::hlsl::float32_t3 verts[3]) { using namespace nbl::hlsl; - const float32_t st = static_cast(std::sin(theta)); - const float32_t ct = static_cast(std::cos(theta)); + const float32_t st = static_cast(std::sin(theta)); + const float32_t ct = static_cast(std::cos(theta)); constexpr float64_t twoPiOver3 = 2.0 * numbers::pi / 3.0; - verts[0] = float32_t3(st, 0.0f, ct); - verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), + verts[0] = float32_t3(st, 0.0f, ct); + verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), static_cast(st * std::sin(twoPiOver3)), ct); - verts[2] = float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), + verts[2] = 
float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), static_cast(st * std::sin(2.0 * twoPiOver3)), ct); } -// Monte Carlo estimate of projected solid angle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula (which uses abs so that -// triangles straddling the horizon contribute positively from both hemispheres). -// Samples L uniformly from the spherical triangle. -inline float64_t mcEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N, std::mt19937& rng) +// Grid estimate of projected solid angle: mean of abs(dot(L, normal)) over a regular +// [0,1]^2 grid, times solidAngle. Uses abs() to match the BSDF projected solid angle +// formula (triangles/rects straddling the horizon contribute from both hemispheres). +// `N` is the total number of samples; the grid side is ceil(sqrt(N)). Grid integration +// is deterministic and has much lower variance than MC at the same sample count, +// so it's a tighter ground truth for PSA-vs-formula comparisons. 
+inline float64_t gridEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N) { using namespace nbl::hlsl; - auto sampler = sampling::SphericalTriangle::create(shape); - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + auto sampler = sampling::SphericalTriangle::create(shape); + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalTriangle::cache_type cache; - float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalTriangle::cache_type cache; + const float32_t3 L = sampler.generate(float32_t2(ux, uy), cache); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(shape.solid_angle); + return sum / static_cast(gridSide * gridSide) * static_cast(shape.solid_angle); } -// Monte Carlo estimate of projected solid angle for a rectangle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula. -// Samples uniformly from the spherical rectangle, reconstructs world-space direction. -inline float64_t mcEstimatePSA( +// Sampler-independent PSA reference for rectangles. Integrates the projected-solid-angle integral +// PSA = integral over rect surface of |cos(theta_receiver)| * |cos(theta_rect)| / d^2 dA +// on a uniform surface grid in (s, t) in [0, extents.x] x [0, extents.y]. No sampler involved, +// so disagreement with a sampler-derived PSA isolates the sampler / formula. 
+inline float64_t surfaceGridEstimatePSA( const nbl::hlsl::shapes::SphericalRectangle& shape, const nbl::hlsl::float32_t3& observer, const nbl::hlsl::float32_t3& normal, - uint32_t N, std::mt19937& rng) + uint32_t N) +{ + using namespace nbl::hlsl; + const float32_t3 rdir = shape.basis[0]; + const float32_t3 udir = shape.basis[1]; + const float32_t3 rectNormal = shape.basis[2]; + const float32_t width = shape.extents.x; + const float32_t height = shape.extents.y; + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float64_t cellArea = static_cast(width) * static_cast(height) / static_cast(gridSide * gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) + { + const float32_t t = (static_cast(iy) + 0.5f) * height / static_cast(gridSide); + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float32_t s = (static_cast(ix) + 0.5f) * width / static_cast(gridSide); + const float32_t3 worldPt = shape.origin + rdir * s + udir * t; + const float32_t3 toSurf = worldPt - observer; + const float64_t d2 = static_cast(dot(toSurf, toSurf)); + const float64_t d = std::sqrt(d2); + const float32_t3 L = toSurf * static_cast(1.0 / d); + const float64_t cosRx = static_cast(hlsl::abs(dot(normal, L))); + const float64_t cosRt = static_cast(hlsl::abs(dot(rectNormal, L))); + sum += cosRx * cosRt / d2; + } + } + return sum * cellArea; +} + +// Grid estimate of projected solid angle for a rectangle: mean of abs(dot(L, normal)) +// over a regular [0,1]^2 grid, times solidAngle. See the triangle overload above. 
+inline float64_t gridEstimatePSA( + const nbl::hlsl::shapes::SphericalRectangle& shape, + const nbl::hlsl::float32_t3& observer, + const nbl::hlsl::float32_t3& normal, + uint32_t N) { using namespace nbl::hlsl; auto sampler = sampling::SphericalRectangle::create(shape, observer); if (sampler.solidAngle <= 0.0f || !std::isfinite(sampler.solidAngle)) return 0.0; - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalRectangle::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); - // Reconstruct world-space direction from rectangle offset - float32_t3 worldPt = shape.origin - + shape.basis[0] * gen.x - + shape.basis[1] * gen.y; - float32_t3 L = normalize(worldPt - observer); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalRectangle::cache_type cache; + // `generateLocalBasisXY` returns absolute (xu, yv) on the rectangle surface; subtract r0.xy + // to get the offset-from-r0 that the world-space reconstruction below expects. 
+ const float32_t2 absXY = sampler.generateLocalBasisXY(float32_t2(ux, uy), cache); + const float32_t2 gen = absXY - float32_t2(sampler.r0.x, sampler.r0.y); + const float32_t3 worldPt = shape.origin + shape.basis[0] * gen.x + shape.basis[1] * gen.y; + const float32_t3 L = normalize(worldPt - observer); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(sampler.solidAngle); + return sum / static_cast(gridSide * gridSide) * static_cast(sampler.solidAngle); } // Bundles seed + rng + failCount for randomized property tests. @@ -357,14 +500,18 @@ struct SeededTestContext } }; -// Generic PSA vs MC comparison. -// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& mcPSA, InfoLogger& info) -// Must set formulaPSA and mcPSA for config `index`, or set both to 0 to skip. +// Generic PSA vs grid-integration comparison. +// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& gridPSA, InfoLogger& info) +// Must set formulaPSA and gridPSA for config `index`, or set both to 0 to skip. // `info` is a callable: void(nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) that logs // sampler/shape details for the current config. Called on mismatch. -// When diagnostic=true, failures log at ELL_WARNING instead of ELL_ERROR (non-hard-fail). +// Two-tier tolerance: +// - (relTol, absTol): soft threshold. Exceedance counts as a mismatch. With diagnostic=true +// the run still returns true (known-limitation noise); with diagnostic=false it hard-fails. +// - (hardRelTol, hardAbsTol): egregious threshold. Always hard-fails regardless of diagnostic, +// so a catastrophic regression can't hide inside the warning stream. 
template -inline bool testPSAVersusMonteCarlo( +inline bool testPSAVersusGrid( nbl::system::ILogger* logger, const char* tag, const char* label, @@ -372,49 +519,78 @@ inline bool testPSAVersusMonteCarlo( uint32_t numConfigs, float64_t relTol, float64_t absTol, + float64_t hardRelTol, + float64_t hardAbsTol, bool diagnostic = false) { - const auto failLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; + const auto softFailLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; SeededTestContext ctx; + uint32_t hardFailCount = 0; + uint32_t testedCount = 0; for (uint32_t c = 0; c < numConfigs; c++) { - float64_t formulaPSA = 0.0, mcPSA = 0.0; + float64_t formulaPSA = 0.0, gridPSA = 0.0; std::function logInfo = - [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) {}; - configGenerator(ctx.rng, c, formulaPSA, mcPSA, logInfo); + [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) { + }; + configGenerator(ctx.rng, c, formulaPSA, gridPSA, logInfo); - if (mcPSA == 0.0 && formulaPSA == 0.0) + if (gridPSA == 0.0 && formulaPSA == 0.0) continue; + testedCount++; + + const float64_t absErr = std::abs(formulaPSA - gridPSA); + const float64_t relErr = (std::abs(gridPSA) > 1e-10) ? absErr / std::abs(gridPSA) : 0.0; - const float64_t absErr = std::abs(formulaPSA - mcPSA); - const float64_t relErr = (std::abs(mcPSA) > 1e-10) ? absErr / std::abs(mcPSA) : 0.0; + const bool softFail = relErr > relTol && absErr > absTol; + const bool hardFail = relErr > hardRelTol && absErr > hardAbsTol; - if (relErr > relTol && absErr > absTol) + if (softFail) { ctx.failCount++; + if (hardFail) + hardFailCount++; if (ctx.failCount <= 5) { - logger->log(" [%s] %s mismatch: formula=%f expected(MC)=%f relErr=%e absErr=%e config %u", - failLevel, tag, label, formulaPSA, mcPSA, relErr, absErr, c); - logInfo(logger, failLevel); + const auto level = hardFail ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + logger->log(" [%s] %s %s: formula=%f expected(grid)=%f relErr=%e absErr=%e config %u", + level, tag, label, hardFail ? "HARD mismatch" : "mismatch", + formulaPSA, gridPSA, relErr, absErr, c); + logInfo(logger, level); } } } + const uint32_t skippedCount = numConfigs - testedCount; + if (ctx.failCount == 0) - logger->log(" [%s] %s PASSED (%u configs, relTol=%e absTol=%e)", - nbl::system::ILogger::ELL_PERFORMANCE, tag, label, numConfigs, relTol, absTol); - else { - logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, relTol=%e absTol=%e)", - failLevel, tag, label, ctx.failCount, numConfigs, relTol, absTol); - if (diagnostic) - logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", - nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + logger->log(" [%s] %s PASSED (%u tested, %u skipped of %u requested, relTol=%e absTol=%e)", + nbl::system::ILogger::ELL_PERFORMANCE, tag, label, + testedCount, skippedCount, numConfigs, relTol, absTol); + return true; } - return diagnostic ? true : ctx.finalize(logger, tag); + const bool hardFailed = hardFailCount > 0; + const auto summaryLevel = hardFailed ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + if (hardFailed) + logger->log(" [%s] %s FAILED (%u/%u exceeded soft tol, %u/%u exceeded HARD tol, %u skipped of %u, hardRelTol=%e hardAbsTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, hardFailCount, testedCount, + skippedCount, numConfigs, hardRelTol, hardAbsTol); + else + logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, %u skipped of %u, relTol=%e absTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, skippedCount, numConfigs, relTol, absTol); + + const bool shouldHardFail = hardFailed || !diagnostic; + if (shouldHardFail) + logger->log(" [%s] reproduce with seed=%u", + nbl::system::ILogger::ELL_ERROR, tag, ctx.seed); + else + logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", + nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + + return !shouldHardFail; } // ============================================================================ @@ -435,23 +611,21 @@ inline void generateRandomRectangle(std::mt19937& rng, float32_t3 t1, t2; buildTangentFrame(normal, t1, t2); - const float width = sizeDist(rng); + const float width = sizeDist(rng); const float height = sizeDist(rng); - const float dist = distDist(rng); + const float dist = distDist(rng); - observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); + observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); compressed.origin = observer - normal * dist + t1 * offsetDist(rng) + t2 * offsetDist(rng); - compressed.right = t1 * width; - compressed.up = t2 * height; + compressed.right = t1 * width; + compressed.up = t2 * height; } // Stress rectangles: ill-conditioned geometries that exercise edge cases. 
// - Extreme aspect ratio (10:1 to 20:1) // - Grazing angle (observer nearly in the rectangle plane) // - Observer near corner (most of the rectangle off to one side) -inline void generateStressRectangle(std::mt19937& rng, - nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, - nbl::hlsl::float32_t3& observer) +inline void generateStressRectangle(std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer) { using namespace nbl::hlsl; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -464,39 +638,39 @@ inline void generateStressRectangle(std::mt19937& rng, switch (caseDist(rng)) { case 0: // Extreme aspect ratio - { - const float longSide = 3.0f + uDist(rng) * 5.0f; - const float shortSide = 0.1f + uDist(rng) * 0.2f; - const float dist = 1.5f + uDist(rng) * 2.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); - compressed.right = t1 * longSide; - compressed.up = t2 * shortSide; - break; - } + { + const float longSide = 3.0f + uDist(rng) * 5.0f; + const float shortSide = 0.1f + uDist(rng) * 0.2f; + const float dist = 1.5f + uDist(rng) * 2.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); + compressed.right = t1 * longSide; + compressed.up = t2 * shortSide; + break; + } case 1: // Grazing angle (observer nearly in the rectangle plane) - { - const float width = 1.0f + uDist(rng) * 2.0f; - const float height = 1.0f + uDist(rng) * 2.0f; - const float normalDist = 0.05f + uDist(rng) * 0.15f; - const float tangentOffset = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 1.0f + uDist(rng) * 2.0f; + const float height = 1.0f + 
uDist(rng) * 2.0f; + const float normalDist = 0.05f + uDist(rng) * 0.15f; + const float tangentOffset = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } default: // Observer near corner - { - const float width = 2.0f + uDist(rng) * 3.0f; - const float height = 2.0f + uDist(rng) * 3.0f; - const float dist = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 2.0f + uDist(rng) * 3.0f; + const float height = 2.0f + uDist(rng) * 3.0f; + const float dist = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } } } @@ -590,10 +764,10 @@ inline void logRectInfo( { using namespace nbl::system; using namespace nbl::hlsl; - const float width = length(compressed.right); - const float height = length(compressed.up); + const float width = length(compressed.right); + const float height = length(compressed.up); const float32_t3 normal = normalize(cross(compressed.right, compressed.up)); - const float dist = length(compressed.origin - observer); + const float dist = length(compressed.origin - observer); logger->log(" origin=%s right=%s up=%s observer=%s", ILogger::ELL_ERROR, to_string(compressed.origin).c_str(), @@ -617,14 +791,14 @@ inline bool anyRectCornerAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const 
float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f || dot(localN, v1) > 0.0f || - dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; } // True if all rectangle corners have positive NdotL with the given normal. @@ -635,14 +809,14 @@ inline bool allRectCornersAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f && dot(localN, v1) > 0.0f && - dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; } #endif diff --git 
a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h index cb28b63fc..ecb0f606d 100644 --- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h +++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h @@ -414,6 +414,12 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Float32 solid angle (acos sum - pi) loses precision for small + // triangles due to catastrophic cancellation, making the expected + // sub-solid-angle ratio unreliable as a reference value. + // At SA ~ 0.003, the relative error in float32 solid angles reaches + // ~1-3%, comparable to the half-space counting tolerance. + const bool tinyTriangle = SA < 4e-3; // For each cut: pick a vertex and a point on the opposite edge, // forming a great circle that splits the triangle in two. @@ -482,12 +488,20 @@ class CSphericalTriangleGenerateTester testedCuts++; if (absErr > relTol) { - ctx.failCount++; - if (ctx.failCount <= 5) + if (tinyTriangle) { - m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", - system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); - logTriangleInfo(m_logger, v0, v1, v2); + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u -- solid angle %e too small for float32, especially on GPU", + system::ILogger::ELL_WARNING, label, observedFraction, expectedFraction, absErr, relTol, t, c, SA); + } + else + { + ctx.failCount++; + if (ctx.failCount <= 5) + { + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", + system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); + logTriangleInfo(m_logger, v0, v1, v2); + } } } } @@ 
-504,12 +518,20 @@ class CSphericalTriangleGenerateTester } // ------------------------------------------------------------------------- - // Moment matching: E[dot(generate(u), N)] should equal PSA(N) / SA. + // Moment matching: E[dot(generate(u), N)] should equal signedPSA(N) / SA. // // For a uniform distribution over a spherical triangle: // E[f(L)] = (1/SA) * integral_triangle f(L) dw // - // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = PSA(N) / SA. + // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = signedPSA(N) / SA, + // where signedPSA is the exact signed projected solid angle computed + // via the Kelvin-Stokes theorem: + // signedPSA(N) = 0.5 * sum_edges dot(edgeNormal_i, N) * edgeArcLength_i + // + // Note: shapes::SphericalTriangle::projectedSolidAngle() returns a signed result + // (Kelvin-Stokes signed sum); tests abs() the return to compare against the + // |cos(theta)| (BSDF) PSA integral reference. + // // If generate() has a systematic bias (e.g., concentrating samples // near one vertex), this moment will be wrong for most directions N. // Testing multiple random N per triangle makes it very unlikely that @@ -533,11 +555,34 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Precompute edge normals and arc lengths for the signed PSA formula. + // cross(v_j, v_k) * csc_sides[i] gives outward-pointing edge normals + // only when the vertices are CCW as seen from outside the sphere. + // The sign of the triple product dot(v0, cross(v1, v2)) tells us the + // winding: positive = CCW (outward normals), negative = CW (inward). + const float32_t3 crossBC = hlsl::cross(shape.vertices[1], shape.vertices[2]); + const float64_t windingSign = (hlsl::dot(shape.vertices[0], crossBC) >= 0.0f) ? 
1.0 : -1.0; + const float32_t3 edgeNormals[3] = { + crossBC * shape.csc_sides[0], + hlsl::cross(shape.vertices[2], shape.vertices[0]) * shape.csc_sides[1], + hlsl::cross(shape.vertices[0], shape.vertices[1]) * shape.csc_sides[2] + }; + const float64_t edgeAngles[3] = { + std::acos(static_cast(hlsl::clamp(shape.cos_sides[0], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[1], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[2], -1.0f, 1.0f))) + }; + for (uint32_t n = 0; n < numNormals; n++) { float32_t3 N = generateRandomUnitVector(ctx.rng); - const float64_t psa = static_cast(shape.projectedSolidAngle(N)); - const float64_t expected = psa / SA; + + // Signed PSA via Kelvin-Stokes: exact for integral dot(L,N) dOmega + float64_t signedPSA = 0.0; + for (uint32_t e = 0; e < 3; e++) + signedPSA += static_cast(hlsl::dot(edgeNormals[e], N)) * edgeAngles[e]; + signedPSA *= 0.5 * windingSign; + const float64_t expected = signedPSA / SA; float64_t sum = 0.0; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -546,7 +591,7 @@ class CSphericalTriangleGenerateTester float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); typename sampling::SphericalTriangle::cache_type cache; float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(L, N))); + sum += static_cast(dot(L, N)); } const float64_t mcEstimate = sum / static_cast(numSamples); @@ -601,7 +646,7 @@ class CSphericalTriangleGenerateTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) continue; - auto sampler = sampling::SphericalTriangle::create(shape); + auto sampler = sampling::SphericalTriangle::create(shape); std::uniform_real_distribution uDist(0.0f, 1.0f); for (uint32_t i = 0; i < samplesPerTriangle; i++) @@ -742,7 +787,7 @@ class CSphericalTriangleGenerateTester // Tests two aspects of projected spherical triangles: // // 1. 
PSA formula accuracy: shapes::SphericalTriangle::projectedSolidAngle -// against Monte Carlo ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). +// against grid-integration ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). // // 2. PST sampler accuracy: how well ProjectedSphericalTriangle's bilinear // importance sampling approximates the true NdotL distribution, and @@ -767,18 +812,21 @@ class CProjectedSphericalTriangleGeometricTester // when edge normals have mixed signs, even when all vertices are above the horizon. // These tests are diagnostic-only until proper hemisphere clipping is implemented. // TODO: make these hard failures once projectedSolidAngle clips to the hemisphere. - testPSAVersusMonteCarlo("random MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + // Hard-fail thresholds: relErr > 3.0 AND absErr > 0.3 means the formula is catastrophically + // wrong, not just affected by the known abs()-overcount limitation. Catches regressions that + // would otherwise hide in the warning stream. 
+ pass &= testPSAVersusGrid("random", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); - normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, true); - testPSAVersusMonteCarlo("grazing MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, 3.0, 0.3, true); + pass &= testPSAVersusGrid("grazing", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); float32_t3 triCenter = normalize(v0 + v1 + v2); float32_t3 tangent, unused; buildTangentFrame(triCenter, tangent, unused); std::uniform_real_distribution grazeDist(0.02f, 0.15f); - normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, true); + normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, 3.0, 0.3, true); // Also diagnostic -- same abs() issue affects small triangles testPSASmallTriangle(); @@ -860,7 +908,7 @@ class CProjectedSphericalTriangleGeometricTester // Known analytic cases bool testPSAKnownCases() { - constexpr float64_t psaOctantMCRelTol = 0.05; + constexpr float64_t psaOctantGridRelTol = 0.05; constexpr float64_t psaSymmetryRelTol = 1e-4; SeededTestContext ctx; @@ -872,51 +920,52 @@ class CProjectedSphericalTriangleGeometricTester // By Kelvin-Stokes / direct integration, PSA = pi/4 for any axis-aligned normal. 
{ auto shape = createSphericalTriangleShape(float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)); - const float64_t psaZ = static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1))); + const float64_t psaZ = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1)))); - // MC verification: sample many points uniformly from the octant triangle - const float64_t mcPSA = mcEstimatePSA(shape, float32_t3(0, 0, 1), 1000000, ctx.rng); + // Grid verification: evaluate abs(N.L) over a dense grid on the octant triangle + const float64_t gridPSA = gridEstimatePSA(shape, float32_t3(0, 0, 1), 1000000); - const float64_t formulaVsMC = std::abs(psaZ - mcPSA) / std::abs(mcPSA); - m_logger->log(" [PSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", - system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, mcPSA, formulaVsMC); + const float64_t formulaVsGrid = std::abs(psaZ - gridPSA) / std::abs(gridPSA); + m_logger->log(" [TriPSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", + system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, gridPSA, formulaVsGrid); - if (formulaVsMC > psaOctantMCRelTol) + if (formulaVsGrid > psaOctantGridRelTol) { - m_logger->log(" [PSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", - system::ILogger::ELL_ERROR, psaZ, mcPSA, formulaVsMC, psaOctantMCRelTol); + m_logger->log(" [TriPSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", + system::ILogger::ELL_ERROR, psaZ, gridPSA, formulaVsGrid, psaOctantGridRelTol); pass = false; } // Same octant, normal = (1,0,0): by symmetry same result as z-normal - const float64_t psaX = static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0))); + const float64_t psaX = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0)))); const float64_t relDiff = std::abs(psaZ - psaX) / std::max(psaZ, psaX); - m_logger->log(" [PSA] octant symmetry: psaZ=%f 
psaX=%f relDiff=%e", + m_logger->log(" [TriPSA] octant symmetry: psaZ=%f psaX=%f relDiff=%e", system::ILogger::ELL_PERFORMANCE, psaZ, psaX, relDiff); if (relDiff > psaSymmetryRelTol) { - m_logger->log(" [PSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", + m_logger->log(" [TriPSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", system::ILogger::ELL_ERROR, psaZ, psaX, relDiff, psaSymmetryRelTol); pass = false; } } if (pass) - m_logger->log(" [PSA] known cases PASSED (octant z-normal vs MC relTol=%e, octant symmetry z vs x relTol=%e)", - system::ILogger::ELL_PERFORMANCE, psaOctantMCRelTol, psaSymmetryRelTol); + m_logger->log(" [TriPSA] known cases PASSED (octant z-normal vs grid relTol=%e, octant symmetry z vs x relTol=%e)", + system::ILogger::ELL_PERFORMANCE, psaOctantGridRelTol, psaSymmetryRelTol); - return ctx.finalize(pass, m_logger, "PSA"); + return ctx.finalize(pass, m_logger, "TriPSA"); } - // Helper: run MC comparison of formulaPSA vs E[dot(L,N)]*SA for a set of triangle configs. + // Helper: run grid-integration comparison of formulaPSA vs PSA reference for a set of triangle configs. // TriConfigGen: void(rng, index, v0, v1, v2, normal) — generates triangle vertices + normal. 
template - bool testPSAVersusMonteCarlo(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol, bool diagnostic = false) + bool testPSAVersusGrid(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol, bool diagnostic = false) { - return ::testPSAVersusMonteCarlo(m_logger, "PSA", label, - [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "TriPSA", label, + [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { float32_t3 v0, v1, v2, normal; triConfigGenerator(rng, c, v0, v1, v2, normal); @@ -925,8 +974,8 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) return; - formulaPSA = static_cast(shape.projectedSolidAngle(normal)); - mcPSA = mcEstimatePSA(shape, normal, mcSamples, rng); + formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); + gridPSA = gridEstimatePSA(shape, normal, gridSamples); logInfo = [=](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -935,14 +984,14 @@ class CProjectedSphericalTriangleGeometricTester to_string(normal).c_str(), to_string(shape.solid_angle).c_str()); }; }, - numConfigs, relTol, absTol, diagnostic); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, diagnostic); } - // Small triangles -- PSA should approach MC ground truth + // Small triangles -- PSA should approach grid ground truth bool testPSASmallTriangle() { constexpr float64_t smallTriMeanRelErrTol = 0.1; - constexpr uint32_t smallTriMCSamples = 100000; + constexpr uint32_t smallTriGridSamples = 100000; SeededTestContext ctx; bool pass = true; @@ -973,27 +1022,27 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle 
<= 0.0f || !std::isfinite(shape.solid_angle)) continue; - const float64_t formulaPSA = static_cast(shape.projectedSolidAngle(normal)); + const float64_t formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); const float64_t sa = static_cast(shape.solid_angle); const float64_t centerNdotL = static_cast(dot(normal, baseDir)); if (std::abs(centerNdotL) < 0.1 || sa < 1e-10) continue; - // MC ground truth: E[abs(dot(L, N))] * solidAngle - const float64_t mcPSA = mcEstimatePSA(shape, normal, smallTriMCSamples, ctx.rng); + // Grid ground truth: mean over regular [0,1]^2 grid of abs(dot(L, N)) * solidAngle + const float64_t gridPSA = gridEstimatePSA(shape, normal, smallTriGridSamples); - if (std::abs(mcPSA) < 1e-10) + if (std::abs(gridPSA) < 1e-10) continue; - const float64_t relErr = (formulaPSA - mcPSA) / mcPSA; + const float64_t relErr = (formulaPSA - gridPSA) / gridPSA; sumRelErrPerSize[s] += relErr; validTrials[s]++; } } - m_logger->log(" [PSA] small triangle PSA vs MC (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); + m_logger->log(" [TriPSA] small triangle PSA vs grid (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); for (uint32_t s = 0; s < numSizes; s++) { if (validTrials[s] > 0) @@ -1005,14 +1054,14 @@ class CProjectedSphericalTriangleGeometricTester // Skip halfAngle=0.01 (s==5): float32 solid angle precision collapses if (s == 4 && std::abs(meanRelErr) > smallTriMeanRelErrTol) { - m_logger->log(" [PSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", + m_logger->log(" [TriPSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", system::ILogger::ELL_WARNING, halfAngles[s], meanRelErr, smallTriMeanRelErrTol, validTrials[s]); } } } - m_logger->log(" [PSA] small triangle test complete (%u trials across %u sizes, %u MC samples each, meanRelErrTol=%e) -- diagnostic only", - 
system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriMCSamples, smallTriMeanRelErrTol); + m_logger->log(" [TriPSA] small triangle test complete (%u trials across %u sizes, %u grid samples each, meanRelErrTol=%e) -- diagnostic only", + system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriGridSamples, smallTriMeanRelErrTol); return true; // diagnostic only -- abs()-based PSA overestimates, not a hard failure } @@ -1076,7 +1125,7 @@ class CProjectedSphericalTriangleGeometricTester if (!std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) continue; - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); const bool hasPSA = projSA > 0.0 && std::isfinite(projSA); const float64_t rcpPSA = hasPSA ? 1.0 / projSA : 0.0; MISStats& mis = isGrazing ? grazingMIS : normalMIS; @@ -1090,7 +1139,7 @@ class CProjectedSphericalTriangleGeometricTester float32_t3 L = sampler.generate(u, cache); const float64_t trueNdotL = std::max(0.0, static_cast(dot(cfg.normal, L))); - const float64_t bilinearNdotL = static_cast(cache.abs_cos_theta); + const float64_t bilinearNdotL = std::numeric_limits::quiet_NaN(); const float64_t pstPdf = static_cast(sampler.forwardPdf(u, cache)); // Bilinear vs true NdotL @@ -1323,7 +1372,7 @@ class CProjectedSphericalTriangleGeometricTester continue; auto sampler = createSampler(cfg); - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); if (projSA <= 0.0 || !std::isfinite(projSA) || !std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) @@ -1344,7 +1393,11 @@ class CProjectedSphericalTriangleGeometricTester if (trueNdotL < 1e-6) continue; - const float64_t pstPdf = static_cast(sampler.backwardPdf(L)); + // No direct backwardPdf; evaluate forwardPdf at the 
inverted u to recover pdf(L). + const float32_t2 uInv = sampler.sphtri.generateInverse(L); + typename sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(uInv, pdfCache); + const float64_t pstPdf = static_cast(sampler.forwardPdf(uInv, pdfCache)); const float64_t idealPdf = trueNdotL * rcpPSA; if (!std::isfinite(pstPdf) || pstPdf <= 0.0 || idealPdf <= 0.0) @@ -1416,6 +1469,15 @@ struct UniformRectSamplerPolicy return sampler_type::create(shape, observer); } + // Returns offset-from-r0 on the rectangle surface. Goes through generateLocalBasisXY + // (absolute xy) and subtracts r0.xy so the [0, extents] bounds check still applies. + static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampler_type::cache_type cache; + const float32_t2 absXY = s.generateLocalBasisXY(u, cache); + return absXY - float32_t2(s.r0.x, s.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.solidAngle; } static const char* name() { return "SphericalRectangle"; } @@ -1425,7 +1487,8 @@ struct UniformRectSamplerPolicy struct ProjectedRectSamplerPolicy { - using sampler_type = sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for diagnostic logs. + using sampler_type = sampling::ProjectedSphericalRectangle; static sampler_type createSampler(shapes::SphericalRectangle& shape, const float32_t3& observer, std::mt19937& rng) @@ -1439,6 +1502,17 @@ struct ProjectedRectSamplerPolicy return sampler_type::create(shape, observer, receiverNormal, false); } + // Run u through the bilinear warp then the inner sphrect's generateLocalBasisXY, and subtract + // r0.xy to get offset-from-r0 on the rectangle surface. 
+ static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = s.bilinearPatch.generate(u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + const float32_t2 absXY = s.sphrect.generateLocalBasisXY(warped, sphrectCache); + return absXY - float32_t2(s.sphrect.r0.x, s.sphrect.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.sphrect.solidAngle; } static const char* name() { return "ProjectedSphericalRectangle"; } @@ -1635,8 +1709,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); const float coord = cutAlongX ? gen.x : gen.y; if (coord < cutThreshold) countInSub++; @@ -1714,8 +1787,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); float32_t3 dir = reconstructDirection(compressed, shape.extents, observer, gen); sum += static_cast(dot(dir, N)); } @@ -1778,8 +1850,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); if (gen.x < -1e-5f || gen.x > extX + 1e-5f || gen.y < -1e-5f || gen.y > extY + 1e-5f) { @@ -1891,9 +1962,9 @@ using CProjectedSphericalRectangleGenerateTester = CRectangleGenerateTester 3.0 AND absErr > 0.3) still catch catastrophic regressions. 
+ bool pass = true; + pass &= testPSAVersusGrid("random", generateRandomRectangle, 200, 500000, 0.05, 0.01, 3.0, 0.3); + pass &= testPSAVersusGrid("grazing", generateStressRectangle, 200, 500000, 0.1, 0.01, 3.0, 0.3); + return pass; } private: // Reuse rectangle generators from CRectangleGenerateTester using RectGen = void(*)(std::mt19937&, shapes::CompressedSphericalRectangle&, float32_t3&); - bool testPSAVersusMonteCarlo(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol) + bool testPSAVersusGrid(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol) { - return ::testPSAVersusMonteCarlo(m_logger, "RectPSA", label, - [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "RectPSA", label, + [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { shapes::CompressedSphericalRectangle compressed; float32_t3 observer; @@ -1932,7 +2006,9 @@ class CProjectedSphericalRectangleGeometricTester float32_t3 normal = generateRandomUnitVector(rng); formulaPSA = static_cast(shape.projectedSolidAngle(observer, normal)); - mcPSA = mcEstimatePSA(shape, observer, normal, mcSamples, rng); + // surfaceGridEstimatePSA integrates over the rectangle surface directly (no sampler in + // the loop), so a formula-vs-reference mismatch here isolates the PSA formula. 
+ gridPSA = surfaceGridEstimatePSA(shape, observer, normal, gridSamples); logInfo = [compressed, observer, normal, saValue = sa.value](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -1945,7 +2021,7 @@ class CProjectedSphericalRectangleGeometricTester to_string(saValue).c_str()); }; }, - numConfigs, relTol, absTol, true); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, true); } system::ILogger* m_logger; From a4559b941a9d0f465ccc8687630077e045829403 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Fri, 24 Apr 2026 21:22:28 +0300 Subject: [PATCH 4/5] alias table is packed, 2 versions, consolidated WORKGROUP_SIZE for tests and benchmarks, example 37 and 64 now use a single command buffer for benchmarks --- 37_HLSLSamplingTests/CMakeLists.txt | 31 +- .../app_resources/common/alias_table.hlsl | 102 ++- .../common/discrete_sampler_bench.hlsl | 14 +- .../common/spherical_triangle.hlsl | 2 +- .../shaders/alias_table_test.comp.hlsl | 74 --- .../shaders/bilinear_test.comp.hlsl | 4 - .../box_muller_transform_test.comp.hlsl | 4 - .../shaders/concentric_mapping_test.comp.hlsl | 4 - .../cumulative_probability_test.comp.hlsl | 15 +- .../shaders/linear_test.comp.hlsl | 4 - .../shaders/packed_alias_test.comp.hlsl | 114 ++++ .../shaders/polar_mapping_test.comp.hlsl | 4 - .../projected_hemisphere_test.comp.hlsl | 4 - .../shaders/projected_sphere_test.comp.hlsl | 4 - ...ojected_spherical_rectangle_test.comp.hlsl | 6 +- ...rojected_spherical_triangle_test.comp.hlsl | 4 - .../spherical_rectangle_test.comp.hlsl | 6 +- .../shaders/spherical_triangle.comp.hlsl | 4 +- .../shaders/test_compile.comp.hlsl | 10 - .../shaders/uniform_hemisphere_test.comp.hlsl | 4 - .../shaders/uniform_sphere_test.comp.hlsl | 4 - .../benchmarks/CDiscreteSamplerBenchmark.h | 386 +++++------ .../benchmarks/CSamplerBenchmark.h | 120 ++-- 37_HLSLSamplingTests/main.cpp | 189 +++--- .../tests/CAliasTableGPUTester.h | 52 +- 
37_HLSLSamplingTests/tests/CBilinearTester.h | 2 +- .../tests/CBoxMullerTransformTester.h | 2 +- .../tests/CConcentricMappingTester.h | 2 +- .../tests/CCumulativeProbabilityGPUTester.h | 2 +- .../tests/CDiscreteTableTester.h | 608 +++++++++++------- 37_HLSLSamplingTests/tests/CLinearTester.h | 2 +- .../tests/CPolarMappingTester.h | 2 +- .../tests/CProjectedHemisphereTester.h | 2 +- .../tests/CProjectedSphereTester.h | 2 +- .../CProjectedSphericalRectangleTester.h | 2 +- .../tests/CProjectedSphericalTriangleTester.h | 2 +- .../tests/CSphericalRectangleTester.h | 2 +- .../tests/CSphericalTriangleTester.h | 2 +- .../tests/CUniformHemisphereTester.h | 2 +- .../tests/CUniformSphereTester.h | 2 +- 64_EmulatedFloatTest/main.cpp | 123 +--- 41 files changed, 1031 insertions(+), 893 deletions(-) delete mode 100644 37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl create mode 100644 37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 12cbb5bb1..e50fe4663 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -26,7 +26,7 @@ set(DEPENDS app_resources/shaders/projected_spherical_triangle_test.comp.hlsl app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl app_resources/shaders/spherical_rectangle_test.comp.hlsl - app_resources/shaders/alias_table_test.comp.hlsl + app_resources/shaders/packed_alias_test.comp.hlsl app_resources/shaders/cumulative_probability_test.comp.hlsl app_resources/common/linear.hlsl app_resources/common/uniform_hemisphere.hlsl @@ -91,7 +91,7 @@ endif() set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") -set(BENCH_ITERS 2048) +set(BENCH_ITERS 128) set(WORKGROUP_SIZE 64) target_compile_definitions(${EXECUTABLE_NAME} PRIVATE @@ -99,7 +99,7 @@ target_compile_definitions(${EXECUTABLE_NAME} PRIVATE WORKGROUP_SIZE=${WORKGROUP_SIZE} ) -set(BENCH_OPTS 
"\"-DBENCH_ITERS=${BENCH_ITERS}\", \"-DWORKGROUP_SIZE=${WORKGROUP_SIZE}\"") +set(BENCH_OPTS "\"-DBENCH_ITERS=${BENCH_ITERS}\"") set(JSON " [ @@ -340,14 +340,24 @@ set(JSON " \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_test\" + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_test\" }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_bench\", + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_test\", + \"COMPILE_OPTIONS\": [\"-DNBL_PACKED_ALIAS_B\"] + }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_PACKED_ALIAS_B\"] + }, { \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_test\" @@ -361,6 +371,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_yolo_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_eytzinger_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_EYTZINGER\"] } ] ") @@ -370,7 +385,7 @@ NBL_CREATE_NSC_COMPILE_RULES( LINK_TO ${EXECUTABLE_NAME} BINARY_DIR ${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 -DWORKGROUP_SIZE=${WORKGROUP_SIZE} OUTPUT_VAR KEYS INCLUDE 
nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index bb1ed54ef..08706408f 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -8,12 +8,28 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t AliasTestTableSize = 4; +// Log2N = ceil_log2(N) minimises quantisation drift on the stayProb unorm +// (here 30 unorm bits, essentially lossless). +NBL_CONSTEXPR uint32_t AliasTestLog2N = 2; -using AliasTestProbAccessor = ArrayAccessor; -using AliasTestAliasAccessor = ArrayAccessor; -using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPackedWordAccessor = ArrayAccessor; -using AliasTestSampler = sampling::AliasTable; +// Dedicated struct-valued accessor for PackedAliasEntryB. Field-wise copy +// sidesteps HLSL's struct functional-cast ambiguity. 
+struct AliasTestEntryBAccessor +{ + using value_type = sampling::PackedAliasEntryB; + + template + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val.packedWord = data[i].packedWord; + val.ownPdf = data[i].ownPdf; + } + + value_type data[AliasTestTableSize]; +}; struct AliasTableInputValues { @@ -22,7 +38,7 @@ struct AliasTableInputValues struct AliasTableTestResults { - uint32_t generatedIndex; + uint32_t generatedIndex; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; @@ -31,24 +47,55 @@ struct AliasTableTestResults }; // Pre-computed alias table for weights {1, 2, 3, 4}: -// pdf = {0.1, 0.2, 0.3, 0.4} -// prob = {0.4, 0.8, 1.0, 0.8} -// alias = {3, 3, 2, 2} -struct AliasTableTestExecutor +// pdf = {0.1, 0.2, 0.3, 0.4} +// stayProb = {0.4, 0.8, 1.0, 0.8} +// alias = {3, 3, 2, 2} +// +// Log2N = 2 unorm encoding (30 bits for stayProb, 2 bits for alias): +// packedWord = (alias & 0x3) | (round(stayProb * ((1u<<30) - 1)) << 2) +// bin 0: (3) | (429496729 << 2) = 0x66666667 +// bin 1: (3) | (858993458 << 2) = 0xCCCCCCCB +// bin 2: (2) | (1073741823 << 2) = 0xFFFFFFFE +// bin 3: (2) | (858993458 << 2) = 0xCCCCCCCA + +struct PackedAliasATestExecutor +{ + void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) + { + AliasTestPackedWordAccessor wordAcc; + wordAcc.data[0] = 0x66666667u; + wordAcc.data[1] = 0xCCCCCCCBu; + wordAcc.data[2] = 0xFFFFFFFEu; + wordAcc.data[3] = 0xCCCCCCCAu; + + AliasTestPdfAccessor pdfAcc; + pdfAcc.data[0] = 0.1f; + pdfAcc.data[1] = 0.2f; + pdfAcc.data[2] = 0.3f; + pdfAcc.data[3] = 0.4f; + + using Sampler = sampling::PackedAliasTableA; + Sampler sampler = Sampler::create(wordAcc, pdfAcc, AliasTestTableSize); + + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = 
sampler.forwardWeight(input.u, cache); + output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + } +}; + +struct PackedAliasBTestExecutor { void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) { - AliasTestProbAccessor probAcc; - probAcc.data[0] = 0.4f; - probAcc.data[1] = 0.8f; - probAcc.data[2] = 1.0f; - probAcc.data[3] = 0.8f; - - AliasTestAliasAccessor aliasAcc; - aliasAcc.data[0] = 3u; - aliasAcc.data[1] = 3u; - aliasAcc.data[2] = 2u; - aliasAcc.data[3] = 2u; + AliasTestEntryBAccessor entryAcc; + entryAcc.data[0].packedWord = 0x66666667u; entryAcc.data[0].ownPdf = 0.1f; + entryAcc.data[1].packedWord = 0xCCCCCCCBu; entryAcc.data[1].ownPdf = 0.2f; + entryAcc.data[2].packedWord = 0xFFFFFFFEu; entryAcc.data[2].ownPdf = 0.3f; + entryAcc.data[3].packedWord = 0xCCCCCCCAu; entryAcc.data[3].ownPdf = 0.4f; AliasTestPdfAccessor pdfAcc; pdfAcc.data[0] = 0.1f; @@ -56,14 +103,15 @@ struct AliasTableTestExecutor pdfAcc.data[2] = 0.3f; pdfAcc.data[3] = 0.4f; - AliasTestSampler sampler = AliasTestSampler::create(probAcc, aliasAcc, pdfAcc, AliasTestTableSize); + using Sampler = sampling::PackedAliasTableB; + Sampler sampler = Sampler::create(entryAcc, pdfAcc, AliasTestTableSize); - AliasTestSampler::cache_type cache; - output.generatedIndex = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.backwardPdf = sampler.backwardPdf(output.generatedIndex); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + 
output.backwardWeight = sampler.backwardWeight(output.generatedIndex); output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index d5c1d313c..198b72faf 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -7,18 +7,20 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; -struct AliasTablePushConstants +struct CumProbPushConstants { - uint64_t probAddress; // float probability[N] - uint64_t aliasAddress; // uint32_t alias[N] - uint64_t pdfAddress; // float pdf[N] + uint64_t cumProbAddress; // float cumProb[N-1] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; -struct CumProbPushConstants +// Variants A and B both take the entry array plus a separate pdf[] array +// (A: 4 B words, B: 8 B {packedWord, ownPdf}; pdf[] has the same contents in +// both but is tapped independently by the sampler). 
+struct PackedAliasABPushConstants { - uint64_t cumProbAddress; // float cumProb[N-1] + uint64_t entriesAddress; // A: uint32_t words[N] (4 B); B: PackedAliasEntryB[N] (8 B) + uint64_t pdfAddress; // float pdf[N] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 1828139d4..d3cd09326 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -59,7 +59,7 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted);. + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl deleted file mode 100644 index 67047f997..000000000 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#pragma shader_stage(compute) - -#include - -#ifdef BENCH_ITERS -#include "../common/discrete_sampler_bench.hlsl" -#include - -[[vk::push_constant]] AliasTablePushConstants pc; - -struct BdaProbabilityAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaAliasIndexAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaPdfAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = 
vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -using BenchAliasTable = sampling::AliasTable; -#else -#include "../common/alias_table.hlsl" - -[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; -[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; -#endif - -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif -[numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] -void main() -{ - const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; - -#ifdef BENCH_ITERS - BdaProbabilityAccessor probAcc; - probAcc.addr = pc.probAddress; - BdaAliasIndexAccessor aliasAcc; - aliasAcc.addr = pc.aliasAddress; - BdaPdfAccessor pdfAcc; - pdfAcc.addr = pc.pdfAddress; - BenchAliasTable sampler = BenchAliasTable::create(probAcc, aliasAcc, pdfAcc, pc.tableSize); - - float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); - NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; - uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) - { - xi = frac(xi + goldenRatio); - BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(xi, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); - } - - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); -#else - AliasTableTestExecutor executor; - executor(inputTestValues[invID], outputTestValues[invID]); -#endif -} diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 03ac7b36a..438eea31e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = 
nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index 6189d4658..1fb5f6644 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 649c323b2..2a7f1861e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 1091ee447..f06613b49 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -12,13 +12,18 @@ struct BdaCumProbAccessor { using value_type = float32_t; template - void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i))); } - value_type operator[](uint32_t i) NBL_CONST_MEMBER_FUNC 
{ value_type v; get(i, v); return v; } + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i), sizeof(value_type))); } uint64_t addr; }; -using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#if defined(NBL_CUMPROB_EYTZINGER) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#elif defined(NBL_CUMPROB_YOLO_READS) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#else +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#endif #else #include "../common/cumulative_probability.hlsl" @@ -26,11 +31,7 @@ using BenchCumProbSampler = sampling::CumulativeProbabilitySampler outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 17cf83ac5..7b97645b5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl new file mode 100644 index 000000000..b0dbeedac --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl @@ -0,0 +1,114 @@ +#pragma shader_stage(compute) + +#include + +#ifdef BENCH_ITERS +#include "../common/discrete_sampler_bench.hlsl" +#include + +[[vk::push_constant]] 
PackedAliasABPushConstants pc; + +// Log2N bucket. Covers all sweep sizes up to 2^LOG2N buckets without precision +// loss. The same value must be passed to the host-side packA() / +// packB() call so the bit layouts match. +NBL_CONSTEXPR uint32_t LOG2N_BUCKET = 26; + +// Variant A accessor: 4 B packed words. +struct BdaPackedWordAccessor +{ + using value_type = uint32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +// Variant B accessor: 8 B PackedAliasEntryB. Loads a uint2 and decomposes it +// into the POD entry so DXC never sees a bitfield — avoids the Insert/Extract +// round-trip we observed when the sampler read from a bitfield struct. +struct BdaPackedAliasBAccessor +{ + using value_type = nbl::hlsl::sampling::PackedAliasEntryB; + + template) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + const uint64_t loadAddr = addr + uint64_t(8u) * uint64_t(i); + const uint2 raw = vk::RawBufferLoad(loadAddr, 8u); + val.packedWord = raw.x; + val.ownPdf = asfloat(raw.y); + } + + uint64_t addr; +}; + +// Separate 4 B pdf[] accessor. 
+struct BdaPdfAccessor +{ + using value_type = float32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +#ifdef NBL_PACKED_ALIAS_B +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableB; +#else +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableA; +#endif + +#else +#include "../common/alias_table.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; +#endif + +[numthreads(WORKGROUP_SIZE, 1, 1)] +void main() +{ + const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + +#ifdef BENCH_ITERS +#ifdef NBL_PACKED_ALIAS_B + BdaPackedAliasBAccessor entryAcc; +#else + BdaPackedWordAccessor entryAcc; +#endif + entryAcc.addr = pc.entriesAddress; + BdaPdfAccessor pdfAcc; + pdfAcc.addr = pc.pdfAddress; + BenchPackedAlias sampler = BenchPackedAlias::create(entryAcc, pdfAcc, pc.tableSize); + + float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); + NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; + uint32_t acc = 0u; + + [loop] + for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + { + xi = frac(xi + goldenRatio); + BenchPackedAlias::cache_type cache; + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); + } + + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); +#else +#ifdef NBL_PACKED_ALIAS_B + PackedAliasBTestExecutor executor; +#else + PackedAliasATestExecutor executor; +#endif + executor(inputTestValues[invID], outputTestValues[invID]); +#endif +} diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index e0cf7aea0..b5d48cc36 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index d1ef313e5..f543d6dc2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 9b8c234c4..ca4e7eef7 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index ca9b4d43e..fc4ae03b7 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 3d8ec8961..e32251ed8 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index b9766d5ff..542d20587 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl 
index 3595ac86a..bc55facbd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -15,9 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif + [numthreads(WORKGROUP_SIZE, 1, 1)] void main() { diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index cd43c630e..3c832e995 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -190,16 +190,6 @@ void main() aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; ArrayAccessor aliasPdf; aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; - sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; - uint32_t aliasBin0 = aliasTable.generate(0.3); - uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); - acc.x += float32_t(aliasBin0 + aliasBin); - acc.x += aliasTable.forwardPdf(0.3, aliasCache); - acc.x += aliasTable.forwardWeight(0.3, aliasCache); - acc.x += aliasTable.backwardPdf(aliasBin); - acc.x += aliasTable.backwardWeight(aliasBin); // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight ArrayAccessor cumProb; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index 3c43ee119..c0a0e58b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 5879e28bb..1c810afbf 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 02fbf58d2..b2a2fad9a 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -11,97 +11,103 @@ using namespace nbl; -// Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds pipelines once, then sweeps a list of table sizes. For each N it builds -// both tables from the same weight distribution, uploads via BDA buffers, and -// measures GPU throughput using timestamp queries. The cumulative probability -// sampler is run in two variants: the stateful-comparator cache population -// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: struct SetupData { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - std::string aliasShaderKey; - std::string cumProbShaderKey; - std::string cumProbYoloShaderKey; - uint32_t computeFamilyIndex; - uint32_t dispatchGroupCount; + core::smart_refctd_ptr device; + core::smart_refctd_ptr api; + core::smart_refctd_ptr assetMgr; + core::smart_refctd_ptr logger; + IPhysicalDevice* physicalDevice; + std::string packedAliasAShaderKey; + std::string packedAliasBShaderKey; + std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; + std::string cumProbEytzingerShaderKey; + uint32_t computeFamilyIndex; + uint32_t dispatchGroupCount; }; void setup(const SetupData& data) { - m_device = data.device; - m_logger = data.logger; - m_assetMgr = data.assetMgr; + m_device = data.device; + m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_physicalDevice = data.physicalDevice; + m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + // Staging-upload utility. Without this, BDA buffers land in host-visible (system RAM) + // and every sampler load becomes a PCIe round-trip instead of hitting VRAM/L2. 
+ m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + // Command pool + buffers - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); + m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); // Timestamp query pool { - video::IQueryPool::SCreationParams qp = {}; - qp.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qp.queryCount = 2; - qp.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qp); + IQueryPool::SCreationParams qp = {}; + qp.queryType = IQueryPool::TYPE::TIMESTAMP; + qp.queryCount = 2; + qp.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(qp); } const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Shared output buffer (size only depends on thread count) + // Shared output buffer (size only depends on thread count). GPU writes via BDA and + // nothing reads it on the CPU, so pin it to device-local VRAM. 
{ - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = totalThreads * sizeof(uint32_t); - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_outputBuf = m_device->createBuffer(std::move(bp)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + IGPUBuffer::SCreationParams bp = {}; + bp.size = totalThreads * sizeof(uint32_t); + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + m_outputBuf = m_device->createBuffer(std::move(bp)); + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(reqs, m_outputBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } // Pipelines (N-independent; only push constants change per run) - m_aliasPipeline = createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); - m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); - m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_packedAliasAPipeline = createPipeline(data.packedAliasAShaderKey, m_packedAliasAPplnLayout, "alias-packed-A"); + m_packedAliasBPipeline = createPipeline(data.packedAliasBShaderKey, m_packedAliasBPplnLayout, "alias-packed-B"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_cumProbEytzingerPipeline = createPipeline(data.cumProbEytzingerShaderKey, m_cumProbEytzingerPplnLayout, "cumprob-eytzinger"); } // 
DispatchScheduler: uint32_t N -> std::pair. // Lets the caller trade wall-clock for statistical stability per size: // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. - struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + struct DispatchCounts + { + uint32_t warmup; + uint32_t bench; + }; - // Sweep a list of table sizes. For each N: build tables from a fresh weight - // distribution (deterministic seed = 42 + N so different N's get distinct - // distributions but runs are reproducible), upload via BDA, then run all - // three samplers with the dispatch counts chosen by `scheduler`. template void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) { const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", - system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); - m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", - system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-34s | %12s | %12s | %12s | %10s", ILogger::ELL_PERFORMANCE, + "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); for (uint32_t N : tableSizes) { const DispatchCounts dc = scheduler(N); buildAndUpload(N); - runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + // Packed A wins N<=16k; Packed B wins N>=32k. 
SoA and Packed C were dominated + // across every N measured, removed from the sweep. + runSingle(N, "AliasTable (packed A, 4 B)", m_packedAliasAPipeline, m_packedAliasAPplnLayout, SamplerKind::AliasPackedA, dc.warmup, dc.bench); + runSingle(N, "AliasTable (packed B, 8 B)", m_packedAliasBPipeline, m_packedAliasBPplnLayout, SamplerKind::AliasPackedB, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (Eytzinger)", m_cumProbEytzingerPipeline, m_cumProbEytzingerPplnLayout, SamplerKind::CumProbEytzinger, dc.warmup, dc.bench); releaseTables(); } } @@ -109,76 +115,74 @@ class CDiscreteSamplerBenchmark // Convenience: sweep with fixed dispatch counts for every size. void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { - runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { - return {warmupIterations, benchmarkIterations}; - }); + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts + { return {warmupIterations, benchmarkIterations}; }); } private: - enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + enum class SamplerKind + { + AliasPackedA, + AliasPackedB, + CumProbCompare, + CumProbYolo, + CumProbEytzinger + }; template - core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(PushConstantT)}; + const SPushConstantRange 
pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; auto layout = m_device->createPipelineLayout({&pcRange, 1}); if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); - - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = m_assetMgr->getAsset(shaderKey, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - auto shader = m_device->compileShader({.source = source.get()}); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", ILogger::ELL_ERROR, tag); + + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", ILogger::ELL_ERROR, tag); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; + IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + pp.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; } - core::smart_refctd_ptr pipeline; + core::smart_refctd_ptr pipeline; if 
(!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", ILogger::ELL_ERROR, tag); if (m_device->getEnabledFeatures().pipelineExecutableInfo) { auto report = system::to_string(pipeline->getExecutableInfo()); - m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, tag, report.c_str()); } outLayout = std::move(layout); return pipeline; } - core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) + core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } + IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | + IGPUBuffer::EUF_TRANSFER_DST_BIT; + + core::smart_refctd_ptr buf; + 
auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); + future.move_into(buf); return buf; } @@ -186,46 +190,53 @@ class CDiscreteSamplerBenchmark { m_currentN = N; - std::vector weights(N); - std::mt19937 rng(42u + N); + std::vector weights(N); + std::mt19937 rng(42u + N); std::uniform_real_distribution dist(0.001f, 100.0f); for (uint32_t i = 0; i < N; i++) weights[i] = dist(rng); - // Alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + // Build the alias table SoA (intermediate form), then pack it for variants A and B. + // Builder may pad PoT N to N+1 for cache-friendly stride; returned size drives + // every downstream buffer / push-constant value. + std::vector aliasProb; + std::vector aliasIdx; + std::vector aliasPdf; + m_aliasTableN = sampling::AliasTableBuilder::build({weights}, aliasProb, aliasIdx, aliasPdf); + + constexpr uint32_t kPackedLog2N = 26u; + std::vector packedA(m_aliasTableN); + std::vector> packedB(m_aliasTableN); + sampling::AliasTableBuilder::packA({aliasProb}, {aliasIdx}, packedA.data()); + sampling::AliasTableBuilder::packB({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data()); // Cumulative probability (N-1 entries, last bucket implicitly 1.0) - std::vector cumProb(N > 0 ? N - 1 : 0); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); - m_cumProbBuf = cumProbBytes ? 
createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; + std::vector cumProb(N - 1u); + sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); + + // Eytzinger level-order tree: 2*P entries where P = nextPot(N) + const uint32_t eytzingerP = sampling::eytzingerLeafCount(N); + const uint32_t eytzingerTreeSize = 2u * eytzingerP; + std::vector cumProbEytzinger(eytzingerTreeSize); + sampling::buildEytzinger({weights}, cumProbEytzinger.data()); + + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), m_aliasTableN * sizeof(float)); + m_packedAliasABuf = createBdaBuffer(packedA.data(), m_aliasTableN * sizeof(uint32_t)); + m_packedAliasBBuf = createBdaBuffer(packedB.data(), m_aliasTableN * sizeof(sampling::PackedAliasEntryB)); + m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1u) * sizeof(float)); + m_cumProbEytzingerBuf = createBdaBuffer(cumProbEytzinger.data(), eytzingerTreeSize * sizeof(float)); } void releaseTables() { - m_aliasProbBuf = nullptr; - m_aliasIdxBuf = nullptr; - m_aliasPdfBuf = nullptr; - m_cumProbBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_packedAliasABuf = nullptr; + m_packedAliasBBuf = nullptr; + m_cumProbBuf = nullptr; + m_cumProbEytzingerBuf = nullptr; } - void runSingle( - uint32_t N, - const char* name, - const core::smart_refctd_ptr& pipeline, - const core::smart_refctd_ptr& layout, - SamplerKind kind, - uint32_t warmupIterations, - uint32_t benchmarkIterations) + void runSingle(uint32_t N, const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, SamplerKind kind, uint32_t warmupIterations, uint32_t benchmarkIterations) { m_device->waitIdle(); @@ -241,96 +252,103 @@ class CDiscreteSamplerBenchmark // don't measure a tail where the GPU is already winding down. 
const uint32_t cooldownIterations = warmupIterations; - m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_benchCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (kind == SamplerKind::Alias) + if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) { - AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); - pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + PackedAliasABPushConstants pc = {}; + pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = m_aliasTableN; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { - CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf ? m_cumProbBuf->getDeviceAddress() : 0ull; - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + CumProbPushConstants pc = {}; + const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? 
m_cumProbEytzingerBuf : m_cumProbBuf; + pc.cumProbAddress = buf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } for (uint32_t i = 0u; i < warmupIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); for (uint32_t i = 0u; i < cooldownIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; + auto semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; m_queue->submit({&submit, 1u}); m_device->waitIdle(); - uint64_t timestamps[2] = {}; - const auto flags = 
core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + uint64_t timestamps[2] = {}; + const auto flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | + core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - constexpr uint32_t benchIters = BENCH_ITERS; - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); - const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); - const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; - - m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", - system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); + constexpr uint32_t benchIters = BENCH_ITERS; + const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); + const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); + const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); + const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; + const float64_t elapsed_ms = elapsed_ns * 1e-6; + + m_logger->log("%12u | %-34s | %12.3f | %12.3f | %12.3f | %10u", + ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } - 
core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_assetMgr; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_queryPool; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; + core::smart_refctd_ptr m_utils; + core::smart_refctd_ptr m_cmdpool; + core::smart_refctd_ptr m_benchCmdbuf; + core::smart_refctd_ptr m_queryPool; // Pipelines (set up once) - core::smart_refctd_ptr m_aliasPplnLayout; - core::smart_refctd_ptr m_aliasPipeline; - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; - core::smart_refctd_ptr m_cumProbYoloPplnLayout; - core::smart_refctd_ptr m_cumProbYoloPipeline; - - // Per-N data buffers (rebuilt each sweep step) - core::smart_refctd_ptr m_aliasProbBuf; - core::smart_refctd_ptr m_aliasIdxBuf; - core::smart_refctd_ptr m_aliasPdfBuf; - core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_packedAliasAPplnLayout; + core::smart_refctd_ptr m_packedAliasAPipeline; + core::smart_refctd_ptr m_packedAliasBPplnLayout; + core::smart_refctd_ptr m_packedAliasBPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + core::smart_refctd_ptr m_cumProbEytzingerPplnLayout; + core::smart_refctd_ptr m_cumProbEytzingerPipeline; + + // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B. 
+ core::smart_refctd_ptr m_aliasPdfBuf; + core::smart_refctd_ptr m_packedAliasABuf; + core::smart_refctd_ptr m_packedAliasBBuf; + core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_cumProbEytzingerBuf; // Shared - core::smart_refctd_ptr m_outputBuf; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_currentN = 0; + core::smart_refctd_ptr m_outputBuf; + IQueue* m_queue = nullptr; + IPhysicalDevice* m_physicalDevice = nullptr; + uint32_t m_dispatchGroupCount = 0; + uint32_t m_currentN = 0; + uint32_t m_aliasTableN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 9f9854ac5..d95d7f103 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -35,14 +35,12 @@ class CSamplerBenchmark m_logger = data.logger; m_dispatchGroupCount = data.dispatchGroupCount; - // Command pool + 3 command buffers: benchmark (multi-submit), before/after timestamp + // Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] + // so the driver can pipeline adjacent dispatches and the trailing bench dispatches + // aren't measured in a winding-down tail. 
m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf)) m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-before cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-after cmdbuf", system::ILogger::ELL_ERROR); // Timestamp query pool (2 queries: before and after) { @@ -101,26 +99,22 @@ class CSamplerBenchmark m_executableReport = system::to_string(m_pipeline->getExecutableInfo()); } - // Allocate input buffer (host-visible, zero-filled, correctness irrelevant for benchmarking) + // Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness + // irrelevant for benchmarking but we want deterministic input, not garbage) core::smart_refctd_ptr inputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; bparams.size = data.inputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; inputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_inputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to 
allocate input buffer memory", system::ILogger::ELL_ERROR); - if (m_inputAlloc.memory->map({ 0ull, m_inputAlloc.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) - { - std::memset(m_inputAlloc.memory->getMappedPointer(), 0, m_inputAlloc.memory->getAllocationSize()); - m_inputAlloc.memory->unmap(); - } } - // Allocate output buffer (host-visible, GPU writes garbage, never read back) + // Allocate output buffer (device-local VRAM, GPU writes, never read back) core::smart_refctd_ptr outputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; @@ -128,12 +122,29 @@ class CSamplerBenchmark bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; outputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_outputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); } + // Zero-fill the input buffer once on the GPU + { + core::smart_refctd_ptr initCmdbuf; + m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); + initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const asset::SBufferRange range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf }; + initCmdbuf->fillBuffer(range, 0u); + initCmdbuf->end(); + + auto queue = m_device->getQueue(data.computeFamilyIndex, 0); + const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + queue->submit({&submit, 1u}); + m_device->waitIdle(); + } + // Descriptor set: bind both buffers auto pool = 
m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); @@ -161,43 +172,36 @@ class CSamplerBenchmark m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str()); } - // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); - recordBenchmarkCmdBuf(); - recordTimestampCmdBufs(); - - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = { {.cmdbuf = m_timestampBeforeCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = { {.cmdbuf = m_timestampAfterCmdbuf.get()} }; - - // Chains submissions via a timeline semaphore so they execute strictly in order - auto submitSerial = [&](const video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; + const uint32_t cooldownIterations = warmupIterations; + m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); + 
m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); + m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->end(); + + auto semaphore = m_device->createSemaphore(0u); + const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -219,36 +223,10 @@ class CSamplerBenchmark } private: - void recordBenchmarkCmdBuf() - { - m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); - m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - 
m_benchmarkCmdbuf->end(); - } - - void recordTimestampCmdBufs() - { - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - } - core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchmarkCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; core::smart_refctd_ptr m_pplnLayout; core::smart_refctd_ptr m_pipeline; diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 470132aba..e0248d034 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -1,5 +1,7 @@ #include +#include + #include "nbl/examples/examples.hpp" #include "nbl/this_example/builtin/build/spirv/keys.hpp" @@ -109,12 +111,19 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Note: all samplers almost satisfy BasicSampler, but they have cache parameters in generate(). 
static_assert(sampling::concepts::BasicSampler>); static_assert(sampling::concepts::BasicSampler>); - static_assert(sampling::concepts::BasicSampler); - static_assert(sampling::concepts::BasicSampler); + static_assert(sampling::concepts::BasicSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::BasicSampler, sampling::YOLO>>); + static_assert(sampling::concepts::BasicSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::BasicSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::BasicSampler, 4>, ReadOnlyAccessor, 26>>); // --- TractableSampler (level 2) --- generate(domain_type, out cache_type) -> codomain_type, forwardPdf(domain_type, cache_type) -> density_type - static_assert(sampling::concepts::TractableSampler); - static_assert(sampling::concepts::TractableSampler); + ; + static_assert(sampling::concepts::TractableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::TractableSampler, sampling::YOLO>>); + static_assert(sampling::concepts::TractableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::TractableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::TractableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); @@ -130,8 +139,11 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::TractableSampler>); // --- ResamplableSampler (level 3, parallel) --- generate(domain_type, out cache_type) -> codomain_type, forwardWeight(domain_type, cache_type), backwardWeight(codomain_type) - static_assert(sampling::concepts::ResamplableSampler); - static_assert(sampling::concepts::ResamplableSampler); + static_assert(sampling::concepts::ResamplableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::ResamplableSampler, sampling::YOLO>>); + 
static_assert(sampling::concepts::ResamplableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::ResamplableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::ResamplableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); @@ -179,11 +191,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough - // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is - // hidden by hyperthreading rather than by cross-dispatch overlap. constexpr uint32_t testBatchCount = 4096; - constexpr bool DoBenchmark = true; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { @@ -195,8 +204,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat struct BenchEntry { CSamplerBenchmark bench; - std::string sampler; - std::string mode; + std::string sampler; + std::string mode; }; std::vector benchmarks; @@ -222,44 +231,47 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks - constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); - //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:16", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), 
benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "create-only", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + if constexpr (true) + { + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; + addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); + addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (r0,extents)", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + } // Print all pipeline reports first for (auto& entry : benchmarks) @@ -268,16 +280,18 @@ class 
HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.packedAliasAShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get()); + dsData.packedAliasBShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.cumProbEytzingerShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); @@ -295,41 +309,26 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } { - // Sweep covers both the YOLO-vs-Comparator comparison 
(explicit points at - // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from - // N=4 up to 32M in a roughly-power-of-8 progression. + // If you change something here, better change kBenchTable below too const std::vector discreteSizes = { - 4u, - 16u, - 32u, - 100u, - 128u, - 512u, - 8192u, - 10000u, - 131072u, - 1000000u, - 2097152u, - 16777216u, - 33554432u, - }; + 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, + 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; - // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock - // per sampler-per-N stays near 1.5 s. Cost model comes from the prior - // sweep (order-of-magnitude ps/sample vs N). + // Per-N dispatch counts calibrated from a prior measured run auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts { - double ps_per_sample; - if (N < 1000u) ps_per_sample = 15.0; // L1-resident - else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 - else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge - else ps_per_sample = 8000.0; // DRAM-bound - - constexpr double targetNs = 1.5e9; // ~1.5 s per bench - constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); - const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); - const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); - const uint32_t warmup = std::max(20u, bench / 10u); + static constexpr std::pair kBenchTable[] = { + {2u, 7180u}, {4u, 5993u}, {8u, 4490u}, {16u, 4099u}, {32u, 3110u}, {64u, 3026u}, {100u, 2507u}, {128u, 2498u}, {256u, 2477u}, {400u, 2001u}, + {512u, 1827u}, {1024u, 1372u}, {2048u, 1010u}, {2049u, 1010u}, {3000u, 859u}, {4096u, 962u}, {7000u, 742u}, {8192u, 833u}, {10'000u, 590u}, {16'384u, 786u}, {32'768u, 608u}, 
+ {65'536u, 283u}, {131'072u, 174u}, {262'144u, 160u}, {524'288u, 133u}, {1'000'000u, 77u}, {1'048'576u, 128u}, {2'097'152u, 106u}, {16'777'216u, 17u}, {20'971'520u, 17u}, {25'165'824u, 16u}, {33'554'432u, 14u}}; + uint32_t bench = 10u; // fallback for any N not in the table + for (const auto& e : kBenchTable) + if (e.first == N) + { + bench = e.second; + break; + } + const uint32_t warmup = std::max(5u, bench / 10u); return {warmup, bench}; }; @@ -354,8 +353,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat data.logger = m_logger; data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, WORKGROUP_SIZE); + data.shaderKey = std::move(spirvKey); + Tester tester(testBatchCount); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -388,7 +387,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } // --- GPU table sampler tests --- - runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); + runSamplerTest.operator()("PackedAliasA GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_test">(m_device.get()), "PackedAliasATestLog.txt"); + runSamplerTest.operator()("PackedAliasB GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_test">(m_device.get()), "PackedAliasBTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } logJacobianSkipCounts(m_logger.get()); @@ -470,6 +470,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ // Solid angle accuracy and 
small triangle convergence tests (CPU-only) // ================================================================ + if constexpr (true) { m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 32f0e3b28..7665ebbb7 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -6,13 +6,31 @@ #include "nbl/examples/Tester/ITester.h" #include "SamplerTestHelpers.h" -class CAliasTableGPUTester final : public ITester +// Shared GPU correctness harness for the packed alias variants. Labels for +// failed-field messages are selected from the Executor type at compile time. +template +class CPackedAliasTableGPUTester final : public ITester { - using base_t = ITester; - using R = AliasTableTestResults; + using base_t = ITester; + using R = AliasTableTestResults; + + using typename base_t::TestType; + using base_t::getRandomEngine; + using base_t::verifyTestValue; + using base_t::printTestFail; + + static constexpr bool kIsA = std::is_same_v; + static constexpr const char* kGeneratedIdxName = kIsA ? "PackedAliasA::generatedIndex" : "PackedAliasB::generatedIndex"; + static constexpr const char* kForwardPdfName = kIsA ? "PackedAliasA::forwardPdf" : "PackedAliasB::forwardPdf"; + static constexpr const char* kBackwardPdfName = kIsA ? "PackedAliasA::backwardPdf" : "PackedAliasB::backwardPdf"; + static constexpr const char* kForwardWeightName = kIsA ? "PackedAliasA::forwardWeight" : "PackedAliasB::forwardWeight"; + static constexpr const char* kBackwardWeightName = kIsA ? "PackedAliasA::backwardWeight" : "PackedAliasB::backwardWeight"; + static constexpr const char* kJacobianName = kIsA ? 
"PackedAliasA::jacobianProduct" : "PackedAliasB::jacobianProduct"; + static constexpr const char* kPdfConsistencyName = kIsA ? "PackedAliasA::pdf consistency" : "PackedAliasB::pdf consistency"; + static constexpr const char* kWeightConsistencyName = kIsA ? "PackedAliasA::weight consistency" : "PackedAliasB::weight consistency"; public: - CAliasTableGPUTester(const uint32_t testBatchCount, const uint32_t workgroupSize) : base_t(testBatchCount, workgroupSize) {} + CPackedAliasTableGPUTester(const uint32_t testBatchCount) : base_t(testBatchCount, WORKGROUP_SIZE) {} private: AliasTableInputValues generateInputTestValues() override @@ -27,7 +45,7 @@ class CAliasTableGPUTester final : public ITester; +using CPackedAliasBGPUTester = CPackedAliasTableGPUTester; + #endif diff --git a/37_HLSLSamplingTests/tests/CBilinearTester.h b/37_HLSLSamplingTests/tests/CBilinearTester.h index 739af4584..f5bea6896 100644 --- a/37_HLSLSamplingTests/tests/CBilinearTester.h +++ b/37_HLSLSamplingTests/tests/CBilinearTester.h @@ -14,7 +14,7 @@ class CBilinearTester final : public ITester #include #include +#include // Generic ReadOnly accessor wrapping a raw pointer template + requires std::is_arithmetic_v struct ReadOnlyAccessor { - using value_type = T; - template requires std::is_arithmetic_v - void get(I i, V& val) const { val = V(data[i]); } - T operator[](uint32_t i) const { return data[i]; } + using value_type = T; + template + requires std::is_arithmetic_v + void get(I i, V& val) const { val = V(data[i]); } - const T* data; + const T* data; }; -using ProbabilityAccessor = ReadOnlyAccessor; -using AliasIndexAccessor = ReadOnlyAccessor; -using PdfAccessor = ReadOnlyAccessor; - -using TestAliasTable = nbl::hlsl::sampling::AliasTable; -using TestCumulativeProbabilitySampler = nbl::hlsl::sampling::CumulativeProbabilitySampler>; - // Tests table construction for both alias method and cumulative probability. 
// Sampler generate/pdf correctness is verified by GPU testers (CAliasTableGPUTester, CCumulativeProbabilityGPUTester). class CDiscreteTableTester { -public: - CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} - - bool run() - { - bool pass = true; - auto cases = createTestCases(); - - m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testAliasTable(tc.name, tc.weights); - - m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testCumulativeProbability(tc.name, tc.weights); - - return pass; - } - -private: - struct TestCase - { - const char* name; - std::vector weights; - }; - - static std::vector createTestCases() - { - std::vector cases; - cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); - cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); - - { - std::vector w(32, 1.0f); - w[31] = 97.0f; - cases.push_back({"SingleDominant(32)", std::move(w)}); - } - { - std::vector w(64); - for (uint32_t i = 0; i < 64; i++) - w[i] = 1.0f / float(i + 1); - cases.push_back({"PowerLaw(64)", std::move(w)}); - } - - cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); - - { - std::vector w(1024); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < 1024; i++) - w[i] = dist(rng); - cases.push_back({"Random(1024)", std::move(w)}); - } - - return cases; - } - - // Verify all values in array are in [0, 1] - bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const - { - bool pass = true; - for (uint32_t i = 0; i < count; i++) - { - if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) - { - m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", - system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); - pass = false; - } - } - return pass; - } - - // Shared: verify PDFs sum to 1 and each 
matches weight/totalWeight - bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - bool pass = true; - - float pdfSum = 0.0f; - for (uint32_t i = 0; i < N; i++) - pdfSum += pdf[i]; - - if (std::abs(pdfSum - 1.0f) > 1e-5f) - { - m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); - pass = false; - } - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight; - const float err = std::abs(expected - pdf[i]); - if (err > 1e-6f) - { - m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); - pass = false; - } - } - - return pass; - } - - // Verify alias table builder output: - // - bucket contributions reconstruct correct probabilities - // - PDFs sum to 1 and match weight/totalWeight - // - alias indices in range, probabilities in [0, 1] - bool testAliasTable(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector outProbability(N); - std::vector outAlias(N); - std::vector outPdf(N); - std::vector workspace(N); - - nbl::hlsl::sampling::AliasTableBuilder::build({ weights },outProbability.data(), outAlias.data(), outPdf.data(), workspace.data()); - - // Accumulate bucket contributions - std::vector dest(N, 0.0f); - for (uint32_t i = 0; i < N; i++) - { - dest[i] += outProbability[i]; - dest[outAlias[i]] += (1.0f - outProbability[i]); - } - - bool pass = true; - - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight * float(N); - const float err = std::abs(expected - dest[i]); - const float tolerance = std::max(1e-5f * float(N), 
1e-4f); - - if (err > tolerance) - { - m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", - system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); - pass = false; - } - } - - // Alias indices in range - for (uint32_t i = 0; i < N; i++) - { - if (outAlias[i] >= N) - { - m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", - system::ILogger::ELL_ERROR, name, i, outAlias[i], N); - pass = false; - } - } - - pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); - pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), N); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - // Verify CDF table construction: - // - cumulative probabilities are monotonically non-decreasing - // - PDFs match weight/totalWeight - // - PDFs sum to 1 - bool testCumulativeProbability(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector cumProb(N - 1); - - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram( - std::span(weights), - cumProb.data()); - - bool pass = true; - - // Monotonically non-decreasing - for (uint32_t i = 1; i < N - 1; i++) - { - if (cumProb[i] < cumProb[i - 1] - 1e-7f) - { - m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", - system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); - pass = false; - } - } - - // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) - if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) - { - m_logger->log("CumProb[%s] last stored entry %f >= 1.0", - system::ILogger::ELL_ERROR, name, cumProb[N - 2]); - pass = false; - } - - // Derive PDF from CDF for verification - std::vector pdf(N); - for (uint32_t i = 0; i < N; i++) - { - const float cur = (i < N - 1) ? cumProb[i] : 1.0f; - const float prev = (i > 0) ? 
cumProb[i - 1] : 0.0f; - pdf[i] = cur - prev; - } - - pass &= verifyPdf("CumProb", name, pdf.data(), weights); - pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - system::ILogger* m_logger; + public: + CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} + + bool run() + { + bool pass = true; + auto cases = createTestCases(); + + m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testAliasTable(tc.name, tc.weights); + + m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testCumulativeProbability(tc.name, tc.weights); + + m_logger->log("CumulativeProbabilitySampler tests (TRACKING / YOLO / EYTZINGER):", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testSamplers(tc.name, tc.weights); + + return pass; + } + + private: + struct TestCase + { + const char* name; + std::vector weights; + }; + + static std::vector createTestCases() + { + std::vector cases; + cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); + cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); + + { + std::vector w(32, 1.0f); + w[31] = 97.0f; + cases.push_back({"SingleDominant(32)", std::move(w)}); + } + { + std::vector w(64); + for (uint32_t i = 0; i < 64; i++) + w[i] = 1.0f / float(i + 1); + cases.push_back({"PowerLaw(64)", std::move(w)}); + } + + cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); + + { + std::vector w(1024); + std::mt19937 rng(42); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1024; i++) + w[i] = dist(rng); + cases.push_back({"Random(1024)", std::move(w)}); + } + + // NPoT cases exercise EYTZINGER padded-leaf territory (P > N). 
+ cases.push_back({"NonPot(7)", {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}}); + { + std::vector w(1000); + std::mt19937 rng(4242); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1000; i++) + w[i] = dist(rng); + cases.push_back({"Random(1000)", std::move(w)}); + } + + return cases; + } + + // Verify all values in array are in [0, 1] + bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const + { + bool pass = true; + for (uint32_t i = 0; i < count; i++) + { + if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) + { + m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", + system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); + pass = false; + } + } + return pass; + } + + // Shared: verify PDFs sum to 1 and each matches weight/totalWeight + bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + + bool pass = true; + + float pdfSum = 0.0f; + for (uint32_t i = 0; i < N; i++) + pdfSum += pdf[i]; + + if (std::abs(pdfSum - 1.0f) > 1e-5f) + { + m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); + pass = false; + } + + for (uint32_t i = 0; i < N; i++) + { + const float expected = weights[i] / totalWeight; + const float err = std::abs(expected - pdf[i]); + if (err > 1e-6f) + { + m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); + pass = false; + } + } + + return pass; + } + + // Verify alias table builder output: + // - bucket contributions reconstruct correct scaled probabilities + // - PDFs sum to 1 and match weight/totalWeight + // - alias indices in range, probabilities in [0, 1] + // Builder transparently pads PoT N to N+1; actual table 
size comes back + // as `tableN` and is what gets compared against. + bool testAliasTable(const char* name, const std::vector& weights) const + { + const uint32_t userN = static_cast(weights.size()); + + std::vector outProbability; + std::vector outAlias; + std::vector outPdf; + const uint32_t tableN = nbl::hlsl::sampling::AliasTableBuilder::build({weights}, outProbability, outAlias, outPdf); + + // Accumulate bucket contributions over the full (possibly padded) table + std::vector dest(tableN, 0.0f); + for (uint32_t i = 0; i < tableN; i++) + { + dest[i] += outProbability[i]; + dest[outAlias[i]] += (1.0f - outProbability[i]); + } + + bool pass = true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + // Real buckets: expected scaled prob = weight/total * tableN + for (uint32_t i = 0; i < userN; i++) + { + const float expected = weights[i] / totalWeight * float(tableN); + const float err = std::abs(expected - dest[i]); + const float tolerance = std::max(1e-5f * float(tableN), 1e-4f); + + if (err > tolerance) + { + m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", + system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); + pass = false; + } + } + + // Dummy bucket (only when padded): no real bucket aliases to it -> dest[userN] should be 0. 
+ if (tableN != userN && std::abs(dest[userN]) > 1e-4f) + { + m_logger->log("AliasTable[%s] dummy bucket %u has non-zero reconstructed probability %f", + system::ILogger::ELL_ERROR, name, userN, dest[userN]); + pass = false; + } + + // Alias indices in range [0, tableN) + for (uint32_t i = 0; i < tableN; i++) + { + if (outAlias[i] >= tableN) + { + m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", + system::ILogger::ELL_ERROR, name, i, outAlias[i], tableN); + pass = false; + } + } + + pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); + pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), tableN); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Verify CDF table construction: monotonicity, implicit-1.0 invariant, and + // stored entries in [0, 1]. PDF-from-CDF correctness is covered by the + // TRACKING sampler test below (same cdf[i] - cdf[i-1] derivation via + // sampler.backwardPdf), so it's not repeated here. 
+ bool testCumulativeProbability(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + + std::vector cumProb(N - 1); + + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram(std::span(weights), cumProb.data()); + + bool pass = true; + + // Monotonically non-decreasing + for (uint32_t i = 1; i < N - 1; i++) + { + if (cumProb[i] < cumProb[i - 1] - 1e-7f) + { + m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", + system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); + pass = false; + } + } + + // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) + if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) + { + m_logger->log("CumProb[%s] last stored entry %f >= 1.0", system::ILogger::ELL_ERROR, name, cumProb[N - 2]); + pass = false; + } + + pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Reference binary search over the full N-entry CDF (last entry == 1.0). + static uint32_t referenceUpperBound(const std::vector& fullCdf, float u) + { + auto it = std::upper_bound(fullCdf.begin(), fullCdf.end(), u); + return static_cast(std::distance(fullCdf.begin(), it)); + } + + // Run TRACKING, YOLO, and EYTZINGER samplers against the same reference + // distribution. Each mode is instantiated via the dual-compile sampler and + // exercised entirely on the CPU. 
+ bool testSamplers(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + if (N < 2) + return true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + const float rcpTotal = 1.0f / totalWeight; + + std::vector pdfRef(N); + std::vector fullCdf(N); + float acc = 0.0f; + for (uint32_t i = 0; i < N; i++) + { + pdfRef[i] = weights[i] * rcpTotal; + acc += pdfRef[i]; + fullCdf[i] = acc; + } + fullCdf[N - 1] = 1.0f; // pin the last entry; reference must treat it as exact + + // Storage for TRACKING / YOLO (N-1 entries, last bucket implicit at 1.0). + std::vector cdfStorage(N - 1); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cdfStorage.data()); + + // Storage for EYTZINGER (2*P entries, level-order implicit binary tree). + const uint32_t P = nbl::hlsl::sampling::eytzingerLeafCount(N); + std::vector treeStorage(2u * P, 0.0f); + nbl::hlsl::sampling::buildEytzinger({weights}, treeStorage.data()); + + bool pass = true; + pass &= testSamplerMode("TRACKING", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("YOLO", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("EYTZINGER", name, N, pdfRef, fullCdf, treeStorage.data()); + return pass; + } + + template + bool testSamplerMode(const char* modeName, const char* caseName, uint32_t N, + const std::vector& pdfRef, const std::vector& fullCdf, const float* accessorData) const + { + using Sampler = nbl::hlsl::sampling::CumulativeProbabilitySampler< + float, float, uint32_t, ReadOnlyAccessor, Mode>; + + ReadOnlyAccessor accessor {accessorData}; + Sampler sampler = Sampler::create(accessor, N); + + bool pass = true; + + // backwardPdf(v) == pdfRef[v], and the implied PDF sums to 1. 
+ float backwardSum = 0.0f; + for (uint32_t v = 0; v < N; v++) + { + const float got = sampler.backwardPdf(v); + const float expected = pdfRef[v]; + const float err = std::abs(got - expected); + const float tol = 1e-5f; + if (err > tol) + { + m_logger->log("Sampler[%s][%s] backwardPdf[%u]: expected %e, got %e (err=%e)", + system::ILogger::ELL_ERROR, modeName, caseName, v, expected, got, err); + pass = false; + } + backwardSum += got; + } + if (std::abs(backwardSum - 1.0f) > 1e-5f) + { + m_logger->log("Sampler[%s][%s] backwardPdf sum: expected 1.0, got %f", + system::ILogger::ELL_ERROR, modeName, caseName, backwardSum); + pass = false; + } + + // generate(u) lands in the correct bucket for a grid of u values, and + // generate(u, cache) produces forwardPdf matching backwardPdf(result). + std::mt19937 rng(1234u + N); + std::uniform_real_distribution udist(0.0f, std::nextafter(1.0f, 0.0f)); + constexpr uint32_t kTrials = 2048; + + for (uint32_t k = 0; k < kTrials; k++) + { + const float u = udist(rng); + const uint32_t ref = referenceUpperBound(fullCdf, u); + + const uint32_t idx = sampler.generate(u); + if (idx != ref) + { + m_logger->log("Sampler[%s][%s] generate(%.7f): expected bucket %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idx); + pass = false; + continue; + } + + typename Sampler::cache_type cache; + const uint32_t idxCache = sampler.generate(u, cache); + if (idxCache != ref) + { + m_logger->log("Sampler[%s][%s] generate(u,cache)(%.7f): expected %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idxCache); + pass = false; + continue; + } + + const float forwardP = sampler.forwardPdf(u, cache); + const float backwardP = sampler.backwardPdf(idxCache); + if (std::abs(forwardP - backwardP) > 1e-6f) + { + m_logger->log("Sampler[%s][%s] fwd/bwd pdf mismatch at u=%.7f bucket=%u: fwd=%e bwd=%e", + system::ILogger::ELL_ERROR, modeName, caseName, u, idxCache, forwardP, backwardP); + pass = false; + } + } + + if (pass) + 
m_logger->log(" [%-9s %s] PASSED", system::ILogger::ELL_PERFORMANCE, modeName, caseName); + return pass; + } + + system::ILogger* m_logger; }; #endif diff --git a/37_HLSLSamplingTests/tests/CLinearTester.h b/37_HLSLSamplingTests/tests/CLinearTester.h index 814fbb1d7..394b68721 100644 --- a/37_HLSLSamplingTests/tests/CLinearTester.h +++ b/37_HLSLSamplingTests/tests/CLinearTester.h @@ -14,7 +14,7 @@ class CLinearTester final : public ITestergetFamilyIndex(); m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); // Load shaders, set up pipeline { @@ -1024,6 +1019,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso dummyBuff->setObjectDebugName("benchmark buffer"); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + reqs.memoryTypeBits &= base.m_physicalDevice->getDeviceLocalMemoryTypeBits(); m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_allocation.isValid()) @@ -1075,104 +1071,51 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso { m_device->waitIdle(); - recordTimestampQueryCmdBuffers(); - - uint64_t semaphoreCounter = 0; - smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { 
{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; - - IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; - beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; - beforeTimestapSubmitInfo[0].signalSemaphores = signals; - beforeTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; - afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; - afterTimestapSubmitInfo[0].signalSemaphores = signals; - afterTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; - benchmarkSubmitInfos[0].commandBuffers = cmdbufs; - benchmarkSubmitInfos[0].signalSemaphores = signals; - benchmarkSubmitInfos[0].waitSemaphores = waits; - - m_pushConstants.benchmarkMode = mode; - recordCmdBuff(); - // warmup runs - for (int i = 0; i < WarmupIterations; ++i) - { - if(i == 0) - m_api->startCapture(); - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - if (i == 0) - m_api->endCapture(); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(beforeTimestapSubmitInfo); - - // actual benchmark runs - for (int i = 0; i < Iterations; ++i) - { - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - 
m_computeQueue->submit(afterTimestapSubmitInfo); - - m_device->waitIdle(); + // [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] in one cmdbuf, + // one submit. Per-submit semaphore chaining adds sync cost and blocks driver pipelining; + // the cooldown keeps the GPU in steady state across ts 1 so the trailing bench + // dispatches don't land in a winding-down tail. + constexpr int CooldownIterations = WarmupIterations; - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - void recordCmdBuff() - { - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_cmdbuf->bindComputePipeline(m_pipeline.get()); m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + for (int i = 0; i < WarmupIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (int i = 0; i < Iterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (int i = 0; i < CooldownIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); m_cmdbuf->endDebugMarker(); m_cmdbuf->end(); - 
} - void recordTimestampQueryCmdBuffers() - { - static bool firstInvocation = true; + smart_refctd_ptr semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmdbufs; + submit.signalSemaphores = signalSem; - if (!firstInvocation) - { - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - } + m_api->startCapture(); + m_computeQueue->submit({&submit, 1u}); + m_api->endCapture(); - m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdBuff->end(); + m_device->waitIdle(); - m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdBuff->end(); + const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); + const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - firstInvocation = false; + m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); } uint64_t calcTimeElapsed() @@ -1196,8 +1139,6 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso BenchmarkPushConstants m_pushConstants; smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; - smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; smart_refctd_ptr m_queryPool = nullptr; 
uint32_t m_queueFamily; From 23d6c4c83c2898e8d6fb8329fbd266cbb07ce144 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 13 May 2026 09:43:35 +0300 Subject: [PATCH 5/5] sampler bench BDA push constants + spherical rect tests - pipeline layout declares the PC range, output buf is BDA-allocated, PC pushed before the dispatch loop - spherical rectangle: new tests for generateNormalizedLocal / generateUnnormalized / computeHitT, bug in computeHitT - drop redundant pdf field in tests --- 37_HLSLSamplingTests/CMakeLists.txt | 1 + .../common/sampler_bench_pc.hlsl | 15 ++++++++++++ .../common/spherical_rectangle.hlsl | 18 ++++++++++++++ .../common/uniform_hemisphere.hlsl | 1 - .../app_resources/common/uniform_sphere.hlsl | 1 - .../shaders/bilinear_test.comp.hlsl | 5 ++-- .../box_muller_transform_test.comp.hlsl | 5 ++-- .../shaders/concentric_mapping_test.comp.hlsl | 5 ++-- .../shaders/linear_test.comp.hlsl | 5 ++-- .../shaders/polar_mapping_test.comp.hlsl | 5 ++-- .../projected_hemisphere_test.comp.hlsl | 5 ++-- .../shaders/projected_sphere_test.comp.hlsl | 5 ++-- ...ojected_spherical_rectangle_test.comp.hlsl | 5 ++-- ...rojected_spherical_triangle_test.comp.hlsl | 5 ++-- .../spherical_rectangle_test.comp.hlsl | 5 ++-- .../shaders/spherical_triangle.comp.hlsl | 5 ++-- .../shaders/uniform_hemisphere_test.comp.hlsl | 5 ++-- .../shaders/uniform_sphere_test.comp.hlsl | 5 ++-- .../benchmarks/CSamplerBenchmark.h | 20 ++++++++++++---- .../tests/CSphericalRectangleTester.h | 24 +++++++++++++++++++ .../tests/CUniformHemisphereTester.h | 1 - .../tests/CUniformSphereTester.h | 1 - 22 files changed, 113 insertions(+), 34 deletions(-) create mode 100644 37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index e50fe4663..78e3ab319 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -42,6 +42,7 @@ set(DEPENDS 
app_resources/common/concentric_mapping.hlsl app_resources/common/polar_mapping.hlsl app_resources/common/discrete_sampler_bench.hlsl + app_resources/common/sampler_bench_pc.hlsl app_resources/common/alias_table.hlsl app_resources/common/cumulative_probability.hlsl ) diff --git a/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl new file mode 100644 index 000000000..ab357e504 --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl @@ -0,0 +1,15 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ + +#include + +// Implicit-output benchmark push constants. Every sampler bench shader writes +// one uint32_t accumulator per thread to outputAddress[invID]; nothing reads it +// back -- the goal is to keep the optimiser from eliding the sampling work. +// Mirrors the BDA convention from discrete_sampler_bench.hlsl. 
+struct SamplerBenchPushConstants +{ + uint64_t outputAddress; +}; + +#endif diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 4f8d20964..68159405a 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -22,6 +22,11 @@ struct SphericalRectangleTestResults float32_t3 generated; float32_t2 surfaceOffset; float32_t3 referenceDirection; + float32_t3 normalizedLocal; + float32_t hitDist; + float32_t3 unnormalized; + float32_t computedHitT; + float32_t3 normalizedLocalToWorld; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; @@ -61,6 +66,19 @@ struct SphericalRectangleTestExecutor + sampler.basis[1] * localDir[1] + sampler.basis[2] * localDir[2]; } + { + sampling::SphericalRectangle::cache_type cache; + output.normalizedLocal = sampler.generateNormalizedLocal(input.u, cache, output.hitDist); + output.normalizedLocalToWorld = sampler.basis[0] * output.normalizedLocal[0] + + sampler.basis[1] * output.normalizedLocal[1] + + sampler.basis[2] * output.normalizedLocal[2]; + } + { + sampling::SphericalRectangle::cache_type cache; + output.unnormalized = sampler.generateUnnormalized(input.u, cache); + } + output.computedHitT = sampler.computeHitT(output.generated); + output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index fb51838c7..8541bef19 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -15,7 +15,6 @@ struct UniformHemisphereInputValues 
struct UniformHemisphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3737f4575..fb4086e44 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -15,7 +15,6 @@ struct UniformSphereInputValues struct UniformSphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 438eea31e..420cbcd0b 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BilinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index 1fb5f6644..3302db2e9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS 
-[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BoxMullerTransformTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 2a7f1861e..058c3ef11 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -35,7 +36,7 @@ void main() acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ConcentricMappingTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 7b97645b5..acf0887e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; 
+#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -39,7 +40,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else LinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index b5d48cc36..b12b276e3 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -35,7 +36,7 @@ void main() acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else PolarMappingTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index f543d6dc2..9be02b9fd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" 
+[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index ca4e7eef7..7488dc2d5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index fc4ae03b7..dd7f62db4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" 
+[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -66,7 +67,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index e32251ed8..9ed69291a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -54,7 +55,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 542d20587..8cba7fbcb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants 
benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -112,7 +113,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index bc55facbd..14b4843b9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -53,7 +54,7 @@ void main() } } #endif - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index c0a0e58b2..50901e481 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ 
-36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 1c810afbf..0351e358f 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -5,7 +5,8 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; @@ -36,7 +37,7 @@ void main() acc ^= asuint(sampler.forwardPdf(u, cache)); } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index d95d7f103..4f63c6fde 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h @@ -7,6 +7,7 @@ #include #include "nbl/examples/examples.hpp" +#include "../app_resources/common/sampler_bench_pc.hlsl" using namespace nbl; @@ -81,7 +82,12 @@ class CSamplerBenchmark }; auto dsLayout = m_device->createDescriptorSetLayout(bindings); - m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout)); + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(SamplerBenchPushConstants), + }; + m_pplnLayout = m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout)); { video::IGPUComputePipeline::SCreationParams pparams = {}; @@ -119,13 +125,14 @@ class CSamplerBenchmark { video::IGPUBuffer::SCreationParams bparams = {}; bparams.size = data.outputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); - m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); + m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); if (!m_outputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); + m_outputAddress = outputBuf->getDeviceAddress(); } // Zero-fill the input buffer once on the GPU @@ -183,6 +190,10 @@ class CSamplerBenchmark m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, 
&m_ds.get()); + { + SamplerBenchPushConstants pc = { .outputAddress = m_outputAddress }; + m_benchmarkCmdbuf->pushConstants(m_pplnLayout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(pc), &pc); + } for (uint32_t i = 0u; i < warmupIterations; ++i) m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); @@ -233,6 +244,7 @@ class CSamplerBenchmark core::smart_refctd_ptr m_ds; video::IDeviceMemoryAllocator::SAllocation m_inputAlloc = {}; video::IDeviceMemoryAllocator::SAllocation m_outputAlloc = {}; + uint64_t m_outputAddress = 0; video::IQueue* m_queue = nullptr; video::IPhysicalDevice* m_physicalDevice = nullptr; uint32_t m_dispatchGroupCount = 0; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index bc74f6415..7aabc48ec 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -55,6 +55,10 @@ class CSphericalRectangleTester final : public ITester world == generate", actual.generated, actual.normalizedLocalToWorld, iteration, seed, testType, 5e-5, 5e-3); + // computeHitT(generated) must equal hitDist returned by generateNormalizedLocal + pass &= verifyTestValue("SphericalRectangle::computeHitT == hitDist", actual.computedHitT, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + // generateUnnormalized direction must be parallel to generate() (cross product near zero) + { + const nbl::hlsl::float32_t3 c = nbl::hlsl::cross(actual.unnormalized, actual.generated); + pass &= verifyTestValue("SphericalRectangle::generateUnnormalized parallel to generate", c, nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f), iteration, seed, testType, 1e-3, 5e-2); + } + // |generateUnnormalized| must equal hitDist (distance to hitpoint along the unit ray) + { + const float ulen = nbl::hlsl::length(actual.unnormalized); + 
pass &= verifyTestValue("SphericalRectangle::|generateUnnormalized| == hitDist", ulen, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + } + if (!pass && iteration < m_inputs.size()) logFailedInput(m_logger.get(), m_inputs[iteration]); diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index b07cee739..4f80ecbaf 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -38,7 +38,6 @@ class CUniformHemisphereTester final : public ITester