diff --git a/apps/typegpu-docs/package.json b/apps/typegpu-docs/package.json index 2307fa9398..d4c7fa9481 100644 --- a/apps/typegpu-docs/package.json +++ b/apps/typegpu-docs/package.json @@ -29,10 +29,10 @@ "@stackblitz/sdk": "^1.11.0", "@tailwindcss/vite": "^4.1.18", "@typegpu/color": "workspace:*", - "@typegpu/concurrent-scan": "workspace:*", "@typegpu/geometry": "workspace:*", "@typegpu/noise": "workspace:*", "@typegpu/sdf": "workspace:*", + "@typegpu/sort": "workspace:*", "@typegpu/three": "workspace:*", "@types/react": "^19.1.8", "@types/react-dom": "^19.1.6", diff --git a/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.html b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.html new file mode 100644 index 0000000000..65c5005787 --- /dev/null +++ b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.html @@ -0,0 +1,48 @@ + + + diff --git a/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.ts b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.ts new file mode 100644 index 0000000000..9454374421 --- /dev/null +++ b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/index.ts @@ -0,0 +1,282 @@ +import tgpu, { d, std } from 'typegpu'; +import { + type BitonicSorter, + type BitonicSorterOptions, + createBitonicSorter, + decomposeWorkgroups, +} from '@typegpu/sort'; +import { randf } from '@typegpu/noise'; +import { fullScreenTriangle } from 'typegpu/common'; +import { defineControls } from '../../common/defineControls.ts'; + +const maxBufferSize = await navigator.gpu.requestAdapter().then((adapter) => { + if (!adapter) { + throw new Error('No GPU adapter found'); + } + const limits = adapter.limits; + return Math.min(limits.maxStorageBufferBindingSize, limits.maxBufferSize); +}); + +const root = await tgpu.init({ + device: { + optionalFeatures: ['timestamp-query'], + requiredLimits: { + maxStorageBufferBindingSize: maxBufferSize, + maxBufferSize: maxBufferSize, + }, + }, +}); +const hasTimestampQuery = root.enabledFeatures.has('timestamp-query'); +const querySet = hasTimestampQuery ? root.createQuerySet('timestamp', 2) : null; + +const canvas = document.querySelector('canvas') as HTMLCanvasElement; +const context = root.configureContext({ canvas }); + +const presentationFormat = navigator.gpu.getPreferredCanvasFormat(); + +const maxSide = Math.floor(Math.sqrt(maxBufferSize / 4)); +const minLog = 2; // log_2(4) +const maxLog = Math.floor(Math.log2(maxSide)); +const arraySizeOptions = Array.from({ length: 8 }, (_, i) => { + const side = Math.round(2 ** (minLog + (i * (maxLog - minLog)) / 7)); + return side * side; +}); + +type SortOrderKey = 'ascending' | 'descending' | 'bit-reversed' | 'xor-scatter'; + +const sortOrders: Record = { + ascending: {}, + descending: { + compare: (a, b) => { + 'use gpu'; + return a > b; + }, + paddingValue: 0, + }, + 'bit-reversed': { + compare: (a, b) => { + 'use gpu'; + return std.reverseBits(a) < std.reverseBits(b); + }, + }, + 'xor-scatter': { + compare: (a, b) => { + 'use gpu'; + return (a ^ 0xaa) < (b ^ 0xaa); + }, + }, +}; + +const state = { + arraySize: arraySizeOptions[2], + sortOrder: 'ascending' as SortOrderKey, +}; + +const WORKGROUP_SIZE = 256; + +const renderLayout = tgpu.bindGroupLayout({ + data: { + storage: d.arrayOf(d.u32), + access: 'readonly', + }, +}); + +const initLayout = tgpu.bindGroupLayout({ + data: { + storage: d.arrayOf(d.u32), + access: 'mutable', + }, +}); + +const initLength = root.createUniform(d.u32, state.arraySize); +const initSeed = root.createUniform(d.f32, 0); + +const fragmentFn = tgpu.fragmentFn({ + in: { uv: d.vec2f }, + out: d.vec4f, +})((input) => { + const arrayLength = initLength.$; + + const cols = d.u32(std.round(std.sqrt(d.f32(arrayLength)))); + const rows = d.u32(std.round(arrayLength / cols)); + + const col = d.u32(std.floor(input.uv.x * d.f32(cols))); + const row = d.u32(std.floor(input.uv.y * d.f32(rows))); + const idx = row * cols + col; + + if (idx >= arrayLength) { + return d.vec4f(0.1, 0.1, 0.1, 1); + } + + const value = renderLayout.$.data[idx]; + const normalized = value / 255; + + return d.vec4f(normalized, normalized, normalized, 1); +}); + +const initKernel = tgpu.computeFn({ + workgroupSize: [WORKGROUP_SIZE], + in: { + gid: d.builtin.globalInvocationId, + numWorkgroups: d.builtin.numWorkgroups, + }, +})((input) => { + const spanX = input.numWorkgroups.x * WORKGROUP_SIZE; + const spanY = input.numWorkgroups.y * spanX; + const idx = input.gid.x + input.gid.y * spanX + input.gid.z * spanY; + + if (idx >= initLength.$) { + return; + } + + randf.seed3(d.vec3f(d.f32(idx & 0xffff), d.f32(idx >> 16), initSeed.$)); + const n = randf.sample(); + initLayout.$.data[idx] = d.u32(std.floor(n * 256.0)); +}); + +const renderPipeline = root.createRenderPipeline({ + vertex: fullScreenTriangle, + fragment: fragmentFn, + targets: { format: presentationFormat }, +}); + +const initPipeline = root.createComputePipeline({ compute: initKernel }); + +let buffer = root.createBuffer(d.arrayOf(d.u32, state.arraySize)).$usage('storage'); + +let bindGroup = root.createBindGroup(renderLayout, { + data: buffer, +}); +let initBindGroup = root.createBindGroup(initLayout, { + data: buffer, +}); + +function createSorters(buf: typeof buffer) { + return Object.fromEntries( + Object.entries(sortOrders).map(([key, opts]) => [key, createBitonicSorter(root, buf, opts)]), + ) as Record; +} + +let sorters = createSorters(buffer); + +function recreateBuffer() { + for (const s of Object.values(sorters)) { + s.destroy(); + } + buffer.destroy(); + + buffer = root.createBuffer(d.arrayOf(d.u32, state.arraySize)).$usage('storage'); + + bindGroup = root.createBindGroup(renderLayout, { + data: buffer, + }); + + initBindGroup = root.createBindGroup(initLayout, { + data: buffer, + }); + + sorters = createSorters(buffer); +} + +function generateRandomArray() { + const workgroupsTotal = Math.ceil(state.arraySize / WORKGROUP_SIZE); + const [workgroupsX, workgroupsY, workgroupsZ] = decomposeWorkgroups(workgroupsTotal); + + initLength.write(state.arraySize); + initSeed.write(Math.random() * 1000); + + initPipeline.with(initBindGroup).dispatchWorkgroups(workgroupsX, workgroupsY, workgroupsZ); + + render(); +} + +function render() { + renderPipeline + .withColorAttachment({ + view: context.getCurrentTexture().createView(), + loadOp: 'clear', + storeOp: 'store', + }) + .with(bindGroup) + .draw(3); +} + +const overlay = document.getElementById('sort-overlay') as HTMLDivElement; +const spinnerEl = document.getElementById('sort-spinner') as HTMLDivElement; +const statusEl = document.getElementById('sort-status') as HTMLSpanElement; +canvas.parentElement?.appendChild(overlay); + +function showOverlay(text: string, showSpinner = true) { + spinnerEl.hidden = !showSpinner; + statusEl.textContent = text; + overlay.hidden = false; + overlay.classList.add('visible'); +} + +function hideOverlay(delayMs = 1500) { + setTimeout(() => { + overlay.classList.remove('visible'); + overlay.addEventListener('transitionend', () => (overlay.hidden = true), { + once: true, + }); + }, delayMs); +} + +async function sort() { + const sorter = sorters[state.sortOrder]; + + showOverlay('Sorting...'); + sorter.run({ querySet: querySet ?? undefined }); + + let gpuTimeMs: number | null = null; + if (querySet?.available) { + querySet.resolve(); + const timestamps = await querySet.read(); + gpuTimeMs = Number(timestamps[1] - timestamps[0]) / 1_000_000; + } + + render(); + + const timeStr = + gpuTimeMs !== null + ? ` in ${ + gpuTimeMs >= 1000 ? `${(gpuTimeMs / 1000).toFixed(2)}s` : `${gpuTimeMs.toFixed(2)}ms` + }` + : ''; + showOverlay(`\u2714 Sorted${timeStr}`, false); + hideOverlay(); +} + +// #region Example controls & Cleanup + +const sortOrderKeys = Object.keys(sortOrders) as SortOrderKey[]; + +export const controls = defineControls({ + 'Array Size': { + initial: arraySizeOptions[2], + options: arraySizeOptions, + onSelectChange: (value) => { + state.arraySize = isNaN(value) ? 64 : value; + recreateBuffer(); + generateRandomArray(); + }, + }, + 'Sort Order': { + initial: 'ascending', + options: sortOrderKeys, + onSelectChange: (value) => { + state.sortOrder = value; + }, + }, + Reshuffle: { onButtonClick: generateRandomArray }, + Sort: { onButtonClick: sort }, +}); + +export function onCleanup() { + for (const s of Object.values(sorters)) { + s.destroy(); + } + root.destroy(); +} + +// #endregion diff --git a/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/meta.json b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/meta.json new file mode 100644 index 0000000000..5d6cad56ce --- /dev/null +++ b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/meta.json @@ -0,0 +1,5 @@ +{ + "title": "Bitonic Sort", + "category": "algorithms", + "tags": ["experimental", "compute"] +} diff --git a/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/thumbnail.png b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/thumbnail.png new file mode 100644 index 0000000000..0fdfbe06ef Binary files /dev/null and b/apps/typegpu-docs/src/examples/algorithms/bitonic-sort/thumbnail.png differ diff --git a/apps/typegpu-docs/src/examples/algorithms/concurrent-chart/calculator.ts b/apps/typegpu-docs/src/examples/algorithms/concurrent-chart/calculator.ts index 1c2a7d6196..737c1e69d7 100644 --- a/apps/typegpu-docs/src/examples/algorithms/concurrent-chart/calculator.ts +++ b/apps/typegpu-docs/src/examples/algorithms/concurrent-chart/calculator.ts @@ -1,4 +1,4 @@ -import { initCache, prefixScan } from '@typegpu/concurrent-scan'; +import { createPrefixScanComputer, prefixScan } from '@typegpu/sort'; import type { TgpuRoot } from 'typegpu'; import { d, std } from 'typegpu'; @@ -45,7 +45,7 @@ export async function performCalculationsWithTime( const jsTime = performance.now() - jsStartTime; // GPU version - initCache(root, { operation: std.add, identityElement: 0 }); + createPrefixScanComputer(root, { operation: std.add, identityElement: 0 }); const querySet = root.createQuerySet('timestamp', 2); const gpuStartTime = performance.now(); const calcResult = prefixScan( diff --git a/apps/typegpu-docs/src/examples/tests/prefix-scan/functions.ts b/apps/typegpu-docs/src/examples/tests/prefix-scan/functions.ts index df00024520..9f086f1cf9 100644 --- a/apps/typegpu-docs/src/examples/tests/prefix-scan/functions.ts +++ b/apps/typegpu-docs/src/examples/tests/prefix-scan/functions.ts @@ -1,7 +1,7 @@ import tgpu from 'typegpu'; import * as d from 'typegpu/data'; import * as std from 'typegpu/std'; -import type { BinaryOp } from '@typegpu/concurrent-scan'; +import type { BinaryOp } from '@typegpu/sort'; // tgpu functions diff --git a/apps/typegpu-docs/src/examples/tests/prefix-scan/index.ts b/apps/typegpu-docs/src/examples/tests/prefix-scan/index.ts index dd14e58cbf..87c3d2b779 100644 --- a/apps/typegpu-docs/src/examples/tests/prefix-scan/index.ts +++ b/apps/typegpu-docs/src/examples/tests/prefix-scan/index.ts @@ -1,6 +1,6 @@ import tgpu from 'typegpu'; import * as d from 'typegpu/data'; -import { type BinaryOp, prefixScan, scan } from '@typegpu/concurrent-scan'; +import { type BinaryOp, prefixScan, scan } from '@typegpu/sort'; import * as std from 'typegpu/std'; import { addFn, concat10, isArrayEqual, mulFn, prefixScanJS, scanJS } from './functions.ts'; diff --git a/apps/typegpu-docs/src/utils/examples/sandboxModules.ts b/apps/typegpu-docs/src/utils/examples/sandboxModules.ts index f4fde81f11..e98c8d92f5 100644 --- a/apps/typegpu-docs/src/utils/examples/sandboxModules.ts +++ b/apps/typegpu-docs/src/utils/examples/sandboxModules.ts @@ -137,11 +137,11 @@ export const SANDBOX_MODULES: Record = { import: { reroute: 'typegpu-color/src/index.ts' }, typeDef: { reroute: 'typegpu-color/src/index.ts' }, }, - '@typegpu/concurrent-scan': { - import: { reroute: 'typegpu-concurrent-scan/src/index.ts' }, - typeDef: { reroute: 'typegpu-concurrent-scan/src/index.ts' }, - }, '@typegpu/three': { typeDef: { reroute: 'typegpu-three/src/index.ts' }, }, + '@typegpu/sort': { + import: { reroute: 'typegpu-sort/src/index.ts' }, + typeDef: { reroute: 'typegpu-sort/src/index.ts' }, + }, }; diff --git a/packages/typegpu-concurrent-scan/README.md b/packages/typegpu-concurrent-scan/README.md deleted file mode 100644 index ac1f27babc..0000000000 --- a/packages/typegpu-concurrent-scan/README.md +++ /dev/null @@ -1,39 +0,0 @@ -
- -# @typegpu/concurrent-scan - -
- -A concurrent scan module. For use in WebGPU/TypeGPU apps. Example usage: - -```ts -const calcResult = prefixScan( - root, - { - inputBuffer, - operation: std.add, - identityElement: 0, - }, -); -``` - -Example usage (only the greatest element + timing the shader): - -```ts -// Note: 'timestamp-query' must be requested when initialising the root -const querySet = root.createQuerySet('timestamp', 2); - -const calcResult = scan( - root, - { - inputBuffer, - operation: std.mul, - identityElement: 1, - }, - querySet, -); - -querySet.resolve(); -const [start, end] = await querySet.read(); -const gpuTimeMs = Number(end - start) / 1_000_000; -``` diff --git a/packages/typegpu-concurrent-scan/src/index.ts b/packages/typegpu-concurrent-scan/src/index.ts deleted file mode 100644 index 7cdfcab4c5..0000000000 --- a/packages/typegpu-concurrent-scan/src/index.ts +++ /dev/null @@ -1 +0,0 @@ -export * from './concurrentScan.ts'; diff --git a/packages/typegpu-sort/README.md b/packages/typegpu-sort/README.md new file mode 100644 index 0000000000..ec0ca489e7 --- /dev/null +++ b/packages/typegpu-sort/README.md @@ -0,0 +1,57 @@ +
+ +# @typegpu/sort + +🚧 **Under Construction** 🚧 + +
+ +GPU sorting and scanning algorithms for TypeGPU. + +## Bitonic Sort + +Sorts a `u32` storage buffer in-place. Arrays with non-power-of-2 lengths are padded automatically. + +```ts +import tgpu, { d } from 'typegpu'; +import { createBitonicSorter } from '@typegpu/sort'; + +const root = await tgpu.init(); +const buffer = root.createBuffer(d.arrayOf(d.u32, 1024), data).$usage('storage'); + +const sorter = createBitonicSorter(root, buffer); +sorter.run(); +sorter.destroy(); +``` + +Custom comparator (descending): + +```ts +const sorter = createBitonicSorter(root, buffer, { + compare: (a, b) => { 'use gpu'; return a > b; }, + paddingValue: 0, // must sort to the end — use 0 for descending +}); +``` + +With GPU timing (`timestamp-query` feature required): + +```ts +const querySet = root.createQuerySet('timestamp', 2); +sorter.run({ querySet }); +querySet.resolve(); +const [start, end] = await querySet.read(); +const gpuTimeMs = Number(end - start) / 1_000_000; +``` + +## Prefix Scan + +```ts +import { prefixScan, scan } from '@typegpu/sort'; +import * as std from 'typegpu/std'; + +// Full prefix scan +const result = prefixScan(root, { inputBuffer, operation: std.add, identityElement: 0 }); + +// Reduction only (returns the final aggregated value) +const total = scan(root, { inputBuffer, operation: std.add, identityElement: 0 }); +``` diff --git a/packages/typegpu-concurrent-scan/build.config.ts b/packages/typegpu-sort/build.config.ts similarity index 56% rename from packages/typegpu-concurrent-scan/build.config.ts rename to packages/typegpu-sort/build.config.ts index 7f9f024f1f..4fcfc98991 100644 --- a/packages/typegpu-concurrent-scan/build.config.ts +++ b/packages/typegpu-sort/build.config.ts @@ -1,12 +1,10 @@ -import { type BuildConfig, defineBuildConfig } from 'unbuild'; +import { defineBuildConfig } from 'unbuild'; import typegpu from 'unplugin-typegpu/rollup'; -const Config: BuildConfig[] = defineBuildConfig({ +export default defineBuildConfig({ hooks: { 'rollup:options': (_options, config) => { config.plugins.push(typegpu({ include: [/\.ts$/] })); }, }, }); - -export default Config; diff --git a/packages/typegpu-concurrent-scan/deno.json b/packages/typegpu-sort/deno.json similarity index 65% rename from packages/typegpu-concurrent-scan/deno.json rename to packages/typegpu-sort/deno.json index 66699a4b54..734470f645 100644 --- a/packages/typegpu-concurrent-scan/deno.json +++ b/packages/typegpu-sort/deno.json @@ -1,7 +1,7 @@ { "exclude": ["."], "fmt": { - "exclude": ["!."], + "exclude": ["!.", "./dist"], "singleQuote": true } } diff --git a/packages/typegpu-concurrent-scan/package.json b/packages/typegpu-sort/package.json similarity index 88% rename from packages/typegpu-concurrent-scan/package.json rename to packages/typegpu-sort/package.json index bf001c86d2..a1cd101f20 100644 --- a/packages/typegpu-concurrent-scan/package.json +++ b/packages/typegpu-sort/package.json @@ -1,7 +1,7 @@ { - "name": "@typegpu/concurrent-scan", - "version": "0.10.0", - "description": "A concurrent scan module.", + "name": "@typegpu/sort", + "version": "0.9.0", + "description": "GPU sorting and scanning algorithms implemented using TypeGPU.", "keywords": [], "license": "MIT", "type": "module", diff --git a/packages/typegpu-sort/src/bitonic/bitonicSort.ts b/packages/typegpu-sort/src/bitonic/bitonicSort.ts new file mode 100644 index 0000000000..2392faf12c --- /dev/null +++ b/packages/typegpu-sort/src/bitonic/bitonicSort.ts @@ -0,0 +1,293 @@ +import tgpu, { + d, + std, + type StorageFlag, + type TgpuBindGroup, + type TgpuBuffer, + type TgpuRoot, + type UniformFlag, +} from 'typegpu'; +import { compareSlot, defaultCompare } from './slots.ts'; +import type { BitonicSorter, BitonicSorterOptions, BitonicSorterRunOptions } from './types.ts'; +import { decomposeWorkgroups, nextPowerOf2 } from './utils.ts'; + +const WORKGROUP_SIZE = 256; + +const copyParamsType = d.struct({ + srcLength: d.u32, + dstLength: d.u32, + paddingValue: d.u32, +}); + +const sortUniformsType = d.struct({ + k: d.u32, + jShift: d.u32, +}); + +const sortLayout = tgpu.bindGroupLayout({ + data: { + storage: d.arrayOf(d.u32), + access: 'mutable', + }, + uniforms: { + uniform: sortUniformsType, + }, +}); + +const copyLayout = tgpu.bindGroupLayout({ + src: { + storage: d.arrayOf(d.u32), + access: 'readonly', + }, + dst: { + storage: d.arrayOf(d.u32), + access: 'mutable', + }, + params: { + uniform: copyParamsType, + }, +}); + +const copyPadKernel = tgpu['~unstable'].computeFn({ + workgroupSize: [WORKGROUP_SIZE], + in: { + gid: d.builtin.globalInvocationId, + numWorkgroups: d.builtin.numWorkgroups, + }, +})((input) => { + const spanX = input.numWorkgroups.x * WORKGROUP_SIZE; + const spanY = input.numWorkgroups.y * spanX; + + const idx = input.gid.x + input.gid.y * spanX + input.gid.z * spanY; + + const dstLength = copyLayout.$.params.dstLength; + const srcLength = copyLayout.$.params.srcLength; + + if (idx >= dstLength) { + return; + } + + copyLayout.$.dst[idx] = std.select( + copyLayout.$.params.paddingValue, + copyLayout.$.src[idx] as number, + idx < srcLength, + ); +}); + +const copyBackKernel = tgpu['~unstable'].computeFn({ + workgroupSize: [WORKGROUP_SIZE], + in: { + gid: d.builtin.globalInvocationId, + numWorkgroups: d.builtin.numWorkgroups, + }, +})((input) => { + const spanX = input.numWorkgroups.x * WORKGROUP_SIZE; + const spanY = input.numWorkgroups.y * spanX; + + const idx = input.gid.x + input.gid.y * spanX + input.gid.z * spanY; + + if (idx < copyLayout.$.params.srcLength) { + copyLayout.$.dst[idx] = copyLayout.$.src[idx] as number; + } +}); + +const bitonicStepKernel = tgpu['~unstable'].computeFn({ + workgroupSize: [WORKGROUP_SIZE], + in: { + gid: d.builtin.globalInvocationId, + numWorkgroups: d.builtin.numWorkgroups, + }, +})((input) => { + const spanX = input.numWorkgroups.x * WORKGROUP_SIZE; + const spanY = input.numWorkgroups.y * spanX; + + const tid = input.gid.x + input.gid.y * spanX + input.gid.z * spanY; + + const k = sortLayout.$.uniforms.k; + const shift = sortLayout.$.uniforms.jShift; + const dataLength = d.u32(sortLayout.$.data.length); + const stride = 1 << shift; + + const maskBelow = stride - 1; + const below = tid & maskBelow; + const above = tid >> shift; + + const i = below + above * (stride << 1); + const ixj = i + stride; + + if (ixj >= dataLength) { + return; + } + + const ascending = (i & k) === 0; + const left = sortLayout.$.data[i] as number; + const right = sortLayout.$.data[ixj] as number; + + const leftFirst = compareSlot.$(left, right); + const shouldSwap = std.select(leftFirst, !leftFirst, ascending); + + if (shouldSwap) { + sortLayout.$.data[i] = right; + sortLayout.$.data[ixj] = left; + } +}); + +export function createBitonicSorter( + root: TgpuRoot, + data: TgpuBuffer> & StorageFlag, + options?: BitonicSorterOptions, +): BitonicSorter { + const originalSize = data.dataType.elementCount; + const paddedSize = nextPowerOf2(originalSize); + const wasPadded = paddedSize !== originalSize; + + const paddingValue = options?.paddingValue ?? 0xffffffff; + const compareFunc = options?.compare ?? defaultCompare; + + let paddingResources: { + workBuffer: TgpuBuffer> & StorageFlag; + copyPadParams: TgpuBuffer & UniformFlag; + copyBackParams: TgpuBuffer & UniformFlag; + copyPadBindGroup: TgpuBindGroup<(typeof copyLayout)['entries']>; + copyBackBindGroup: TgpuBindGroup<(typeof copyLayout)['entries']>; + } | null = null; + let workBuffer: TgpuBuffer> & StorageFlag; + + const sortWorkgroupsTotal = Math.ceil(paddedSize / 2 / WORKGROUP_SIZE); + const [sortWorkgroupsX, sortWorkgroupsY, sortWorkgroupsZ] = + decomposeWorkgroups(sortWorkgroupsTotal); + + const padWorkgroupsTotal = Math.ceil(paddedSize / WORKGROUP_SIZE); + const [padWorkgroupsX, padWorkgroupsY, padWorkgroupsZ] = decomposeWorkgroups(padWorkgroupsTotal); + + const copyBackWorkgroupsTotal = Math.ceil(originalSize / WORKGROUP_SIZE); + const [copyBackWorkgroupsX, copyBackWorkgroupsY, copyBackWorkgroupsZ] = + decomposeWorkgroups(copyBackWorkgroupsTotal); + + if (wasPadded) { + const paddedWorkBuffer = root.createBuffer(d.arrayOf(d.u32, paddedSize)).$usage('storage'); + + const copyPadParams = root + .createBuffer(copyParamsType, { + srcLength: originalSize, + dstLength: paddedSize, + paddingValue, + }) + .$usage('uniform'); + + const copyBackParams = root + .createBuffer(copyParamsType, { + srcLength: originalSize, + dstLength: originalSize, + paddingValue: 0, + }) + .$usage('uniform'); + + paddingResources = { + workBuffer: paddedWorkBuffer, + copyPadParams, + copyBackParams, + copyPadBindGroup: root.createBindGroup(copyLayout, { + src: data, + dst: paddedWorkBuffer, + params: copyPadParams, + }), + copyBackBindGroup: root.createBindGroup(copyLayout, { + src: paddedWorkBuffer, + dst: data, + params: copyBackParams, + }), + }; + + workBuffer = paddedWorkBuffer; + } else { + workBuffer = data; + } + + const uniformBuffer = root.createBuffer(sortUniformsType).$usage('uniform'); + + const sortBindGroup = root.createBindGroup(sortLayout, { + data: workBuffer, + uniforms: uniformBuffer, + }); + + const sortPipeline = root['~unstable'] + .with(compareSlot, compareFunc) + .withCompute(bitonicStepKernel) + .createPipeline(); + + const copyPadPipeline = root['~unstable'].withCompute(copyPadKernel).createPipeline(); + + const copyBackPipeline = root['~unstable'].withCompute(copyBackKernel).createPipeline(); + + const log2N = Math.log2(paddedSize); + const totalSteps = (log2N * (log2N + 1)) / 2; + + function run(runOptions?: BitonicSorterRunOptions): void { + const querySet = runOptions?.querySet; + + if (paddingResources) { + let pipeline = copyPadPipeline.with(paddingResources.copyPadBindGroup); + if (querySet) { + pipeline = pipeline.withTimestampWrites({ + querySet, + beginningOfPassWriteIndex: 0, + }); + } + pipeline.dispatchWorkgroups(padWorkgroupsX, padWorkgroupsY, padWorkgroupsZ); + } + + let stepIndex = 0; + for (let k = 2; k <= paddedSize; k <<= 1) { + for (let j = k >> 1; j > 0; j >>= 1) { + const jShift = 31 - Math.clz32(j); + uniformBuffer.write({ k, jShift }); + + let pipeline = sortPipeline.with(sortBindGroup); + + if (querySet && !paddingResources) { + const isFirst = stepIndex === 0; + const isLast = stepIndex === totalSteps - 1; + if (isFirst || isLast) { + pipeline = pipeline.withTimestampWrites({ + querySet, + ...(isFirst && { beginningOfPassWriteIndex: 0 }), + ...(isLast && { endOfPassWriteIndex: 1 }), + }); + } + } + + pipeline.dispatchWorkgroups(sortWorkgroupsX, sortWorkgroupsY, sortWorkgroupsZ); + stepIndex++; + } + } + + if (paddingResources) { + let pipeline = copyBackPipeline.with(paddingResources.copyBackBindGroup); + if (querySet) { + pipeline = pipeline.withTimestampWrites({ + querySet, + endOfPassWriteIndex: 1, + }); + } + pipeline.dispatchWorkgroups(copyBackWorkgroupsX, copyBackWorkgroupsY, copyBackWorkgroupsZ); + } + } + + function destroy(): void { + uniformBuffer.destroy(); + if (paddingResources) { + paddingResources.workBuffer.destroy(); + paddingResources.copyPadParams.destroy(); + paddingResources.copyBackParams.destroy(); + } + } + + return { + originalSize, + paddedSize, + wasPadded, + run, + destroy, + }; +} diff --git a/packages/typegpu-sort/src/bitonic/index.ts b/packages/typegpu-sort/src/bitonic/index.ts new file mode 100644 index 0000000000..5e28296880 --- /dev/null +++ b/packages/typegpu-sort/src/bitonic/index.ts @@ -0,0 +1,4 @@ +export { createBitonicSorter } from './bitonicSort.ts'; +export { compareSlot, defaultCompare } from './slots.ts'; +export type { BitonicSorter, BitonicSorterOptions, BitonicSorterRunOptions } from './types.ts'; +export { decomposeWorkgroups } from './utils.ts'; diff --git a/packages/typegpu-sort/src/bitonic/slots.ts b/packages/typegpu-sort/src/bitonic/slots.ts new file mode 100644 index 0000000000..edf09b21c2 --- /dev/null +++ b/packages/typegpu-sort/src/bitonic/slots.ts @@ -0,0 +1,8 @@ +import tgpu, { d } from 'typegpu'; + +/** Default comparison function: ascending order (a < b means a comes before b) */ +export const defaultCompare = tgpu.fn([d.u32, d.u32], d.bool)((a, b) => a < b); + +/** Slot for customizing the comparison function in bitonic sort. + * The function should return true if the first argument should come before the second. */ +export const compareSlot = tgpu.slot<(a: number, b: number) => boolean>(defaultCompare); diff --git a/packages/typegpu-sort/src/bitonic/types.ts b/packages/typegpu-sort/src/bitonic/types.ts new file mode 100644 index 0000000000..17fa03b41a --- /dev/null +++ b/packages/typegpu-sort/src/bitonic/types.ts @@ -0,0 +1,36 @@ +import type { TgpuQuerySet } from 'typegpu'; + +export interface BitonicSorterOptions { + /** Custom comparison function. Returns true if first argument should come before second. + * Default: ascending order (a < b) */ + compare?: (a: number, b: number) => boolean; + /** + * Value used to pad arrays to power-of-2 length. Must sort to the end with your comparator. + * Default: `0xFFFFFFFF` (works for ascending). For descending order, use `0`. + */ + paddingValue?: number; +} + +export interface BitonicSorterRunOptions { + /** + * Optional timestamp query set for GPU timing. Must have at least 2 entries. + * Timestamps are written to indices 0 and 1. For non-power-of-2 arrays, timing + * includes the padding copy passes. + */ + querySet?: TgpuQuerySet<'timestamp'>; +} + +export interface BitonicSorter { + /** Original size of the input array */ + readonly originalSize: number; + /** Size after padding to power of 2 */ + readonly paddedSize: number; + /** Whether the array was padded */ + readonly wasPadded: boolean; + + /** Execute the sort. Can be called repeatedly. */ + run(options?: BitonicSorterRunOptions): void; + + /** Clean up all GPU resources. */ + destroy(): void; +} diff --git a/packages/typegpu-sort/src/bitonic/utils.ts b/packages/typegpu-sort/src/bitonic/utils.ts new file mode 100644 index 0000000000..8961c7507c --- /dev/null +++ b/packages/typegpu-sort/src/bitonic/utils.ts @@ -0,0 +1,39 @@ +/** + * Returns the next power of 2 greater than or equal to n. + * If n is already a power of 2, returns n. + */ +export function nextPowerOf2(n: number): number { + if (n <= 0) return 1; + if ((n & (n - 1)) === 0) return n; + let p = 1; + while (p < n) p <<= 1; + return p; +} + +const MAX_WORKGROUPS_PER_DIMENSION = 65535; + +/** + * Decomposes a total workgroup count into a 3D dispatch grid (x, y, z), + * respecting the WebGPU limit of 65535 workgroups per dimension. + */ +export function decomposeWorkgroups(total: number): [number, number, number] { + if (total <= 0) { + return [1, 1, 1]; + } + + const x = Math.min(total, MAX_WORKGROUPS_PER_DIMENSION); + const remainingAfterX = Math.ceil(total / x); + + const y = Math.min(remainingAfterX, MAX_WORKGROUPS_PER_DIMENSION); + const remainingAfterY = Math.ceil(remainingAfterX / y); + + const z = Math.min(remainingAfterY, MAX_WORKGROUPS_PER_DIMENSION); + + if (Math.ceil(total / (x * y * z)) > 1) { + throw new Error( + `Required workgroups (${total}) exceed device dispatch limits (${MAX_WORKGROUPS_PER_DIMENSION} per dimension)`, + ); + } + + return [x, y, z]; +} diff --git a/packages/typegpu-sort/src/index.ts b/packages/typegpu-sort/src/index.ts new file mode 100644 index 0000000000..6268f407c2 --- /dev/null +++ b/packages/typegpu-sort/src/index.ts @@ -0,0 +1,14 @@ +export { + compareSlot, + createBitonicSorter, + defaultCompare, + decomposeWorkgroups, +} from './bitonic/index.ts'; +export type { + BitonicSorter, + BitonicSorterOptions, + BitonicSorterRunOptions, +} from './bitonic/index.ts'; + +export { prefixScan, scan, createPrefixScanComputer, PrefixScanComputer } from './scan/index.ts'; +export type { BinaryOp } from './scan/index.ts'; diff --git a/packages/typegpu-concurrent-scan/src/compute/applySums.ts b/packages/typegpu-sort/src/scan/compute/applySums.ts similarity index 92% rename from packages/typegpu-concurrent-scan/src/compute/applySums.ts rename to packages/typegpu-sort/src/scan/compute/applySums.ts index 40a10acfc0..2b3ec63c78 100644 --- a/packages/typegpu-concurrent-scan/src/compute/applySums.ts +++ b/packages/typegpu-sort/src/scan/compute/applySums.ts @@ -1,5 +1,4 @@ -import tgpu from 'typegpu'; -import * as d from 'typegpu/data'; +import tgpu, { d } from 'typegpu'; import { operatorSlot, uniformOpLayout, WORKGROUP_SIZE } from '../schemas.ts'; export const uniformOp = tgpu['~unstable'].computeFn({ diff --git a/packages/typegpu-concurrent-scan/src/compute/scan.ts b/packages/typegpu-sort/src/scan/compute/scan.ts similarity index 95% rename from packages/typegpu-concurrent-scan/src/compute/scan.ts rename to packages/typegpu-sort/src/scan/compute/scan.ts index d8ca3bc282..9f7b1cbe3d 100644 --- a/packages/typegpu-concurrent-scan/src/compute/scan.ts +++ b/packages/typegpu-sort/src/scan/compute/scan.ts @@ -1,4 +1,4 @@ -import tgpu from 'typegpu'; +import tgpu, { d, std } from 'typegpu'; import { identitySlot, onlyGreatestElementSlot, @@ -6,8 +6,6 @@ import { scanLayout, WORKGROUP_SIZE, } from '../schemas.ts'; -import * as d from 'typegpu/data'; -import * as std from 'typegpu/std'; import { downsweep, upsweep, workgroupMemory } from './shared.ts'; const fillIdentityArray = tgpu.comptime(() => Array.from({ length: 8 }, () => identitySlot.$)); diff --git a/packages/typegpu-concurrent-scan/src/compute/shared.ts b/packages/typegpu-sort/src/scan/compute/shared.ts similarity index 92% rename from packages/typegpu-concurrent-scan/src/compute/shared.ts rename to packages/typegpu-sort/src/scan/compute/shared.ts index 9abf6dea86..6d6b78322c 100644 --- a/packages/typegpu-concurrent-scan/src/compute/shared.ts +++ b/packages/typegpu-sort/src/scan/compute/shared.ts @@ -1,6 +1,4 @@ -import tgpu from 'typegpu'; -import * as d from 'typegpu/data'; -import * as std from 'typegpu/std'; +import tgpu, { d, std } from 'typegpu'; import { operatorSlot, WORKGROUP_SIZE } from '../schemas.ts'; export const workgroupMemory = tgpu.workgroupVar(d.arrayOf(d.f32, WORKGROUP_SIZE)); diff --git a/packages/typegpu-sort/src/scan/index.ts b/packages/typegpu-sort/src/scan/index.ts new file mode 100644 index 0000000000..20fb09d5ed --- /dev/null +++ b/packages/typegpu-sort/src/scan/index.ts @@ -0,0 +1,2 @@ +export { prefixScan, scan, createPrefixScanComputer, PrefixScanComputer } from './prefixScan.ts'; +export type { BinaryOp } from './types.ts'; diff --git a/packages/typegpu-concurrent-scan/src/concurrentScan.ts b/packages/typegpu-sort/src/scan/prefixScan.ts similarity index 96% rename from packages/typegpu-concurrent-scan/src/concurrentScan.ts rename to packages/typegpu-sort/src/scan/prefixScan.ts index 42323b4050..5cd17e1aee 100644 --- a/packages/typegpu-concurrent-scan/src/concurrentScan.ts +++ b/packages/typegpu-sort/src/scan/prefixScan.ts @@ -1,14 +1,14 @@ -import type { - StorageFlag, - TgpuBuffer, - TgpuComputePipeline, - TgpuFn, - TgpuQuerySet, - TgpuRoot, +import { + type StorageFlag, + type TgpuBuffer, + type TgpuComputePipeline, + type TgpuFn, + type TgpuQuerySet, + type TgpuRoot, + d, } from 'typegpu'; -import * as d from 'typegpu/data'; +import type { BinaryOp } from './types.ts'; import { - type BinaryOp, identitySlot, onlyGreatestElementSlot, operatorSlot, @@ -224,7 +224,7 @@ export function prefixScan( /** * Compute only the aggregated reduction result for `inputBuffer` using the provided operation. * Returns only the top-level sums/reductions instead of the full scan. This is useful when - * you only need the final reduction - for instance, the sum of the whole array). + * you only need the final reduction - for instance, the sum of the whole array. * * @param root - The TypeGPU root/context used to create pipelines, bind groups and buffers. * @param options - Configuration object containing: @@ -288,7 +288,7 @@ function runScan( onlyGreatestElement: boolean, querySet?: TgpuQuerySet<'timestamp'>, ): TgpuBuffer> & StorageFlag { - const computer = initCache(root, { + const computer = createPrefixScanComputer(root, { operation: options.operation, identityElement: options.identityElement, }); @@ -312,7 +312,7 @@ function runScan( * @param binaryOp - The binary operation used by the computer. * @returns A `PrefixScanComputer` instance associated with the provided `root` and `binaryOp`. */ -export function initCache(root: TgpuRoot, binaryOp: BinaryOp): PrefixScanComputer { +export function createPrefixScanComputer(root: TgpuRoot, binaryOp: BinaryOp): PrefixScanComputer { let rootCache = cache.get(root); if (!rootCache) { rootCache = new WeakMap(); @@ -332,5 +332,3 @@ export function initCache(root: TgpuRoot, binaryOp: BinaryOp): PrefixScanCompute } return computer; } - -export type { BinaryOp }; diff --git a/packages/typegpu-concurrent-scan/src/schemas.ts b/packages/typegpu-sort/src/scan/schemas.ts similarity index 77% rename from packages/typegpu-concurrent-scan/src/schemas.ts rename to packages/typegpu-sort/src/scan/schemas.ts index d494aff584..4d0fa0ac07 100644 --- a/packages/typegpu-concurrent-scan/src/schemas.ts +++ b/packages/typegpu-sort/src/scan/schemas.ts @@ -1,11 +1,6 @@ -import tgpu from 'typegpu'; -import * as d from 'typegpu/data'; +import tgpu, { d } from 'typegpu'; export const WORKGROUP_SIZE = 256; -export interface BinaryOp { - operation: (a: number, b: number) => number; - identityElement: number; -} export const scanLayout = tgpu.bindGroupLayout({ input: { storage: d.arrayOf(d.f32), access: 'mutable' }, @@ -16,6 +11,7 @@ export const uniformOpLayout = tgpu.bindGroupLayout({ input: { storage: d.arrayOf(d.f32), access: 'mutable' }, sums: { storage: d.arrayOf(d.f32), access: 'readonly' }, }); + export const operatorSlot = tgpu.slot<(a: number, b: number) => number>(); export const identitySlot = tgpu['~unstable'].accessor(d.f32); export const onlyGreatestElementSlot = tgpu.slot(); diff --git a/packages/typegpu-sort/src/scan/types.ts b/packages/typegpu-sort/src/scan/types.ts new file mode 100644 index 0000000000..0c3183e5e2 --- /dev/null +++ b/packages/typegpu-sort/src/scan/types.ts @@ -0,0 +1,4 @@ +export interface BinaryOp { + operation: (a: number, b: number) => number; + identityElement: number; +} diff --git a/packages/typegpu-concurrent-scan/tsconfig.json b/packages/typegpu-sort/tsconfig.json similarity index 100% rename from packages/typegpu-concurrent-scan/tsconfig.json rename to packages/typegpu-sort/tsconfig.json diff --git a/packages/typegpu/tests/examples/individual/bitonic-sort.test.ts b/packages/typegpu/tests/examples/individual/bitonic-sort.test.ts new file mode 100644 index 0000000000..a595cb1cdb --- /dev/null +++ b/packages/typegpu/tests/examples/individual/bitonic-sort.test.ts @@ -0,0 +1,164 @@ +/** + * @vitest-environment jsdom + */ + +import { describe, expect } from 'vitest'; +import { it } from '../../utils/extendedIt.ts'; +import { runExampleTest, setupCommonMocks } from '../utils/baseTest.ts'; + +describe('bitonic sort example', () => { + setupCommonMocks(); + + it('should produce valid code', async ({ device }) => { + const shaderCodes = await runExampleTest( + { + category: 'algorithms', + name: 'bitonic-sort', + controlTriggers: ['Sort'], + expectedCalls: 4, + }, + device, + ); + + expect(shaderCodes).toMatchInlineSnapshot(` + "struct copyParamsType { + srcLength: u32, + dstLength: u32, + paddingValue: u32, + } + + @group(0) @binding(2) var params: copyParamsType; + + @group(0) @binding(1) var dst: array; + + @group(0) @binding(0) var src: array; + + struct copyPadKernel_Input { + @builtin(global_invocation_id) gid: vec3u, + @builtin(num_workgroups) numWorkgroups: vec3u, + } + + @compute @workgroup_size(256) fn copyPadKernel(input: copyPadKernel_Input) { + let spanX = (input.numWorkgroups.x * 256u); + let spanY = (input.numWorkgroups.y * spanX); + let idx = ((input.gid.x + (input.gid.y * spanX)) + (input.gid.z * spanY)); + let dstLength = params.dstLength; + let srcLength = params.srcLength; + if ((idx >= dstLength)) { + return; + } + dst[idx] = select(params.paddingValue, src[idx], (idx < srcLength)); + } + + struct sortUniformsType { + k: u32, + jShift: u32, + } + + @group(0) @binding(1) var uniforms: sortUniformsType; + + @group(0) @binding(0) var data: array; + + fn defaultCompare(a: u32, b: u32) -> bool { + return (a < b); + } + + struct bitonicStepKernel_Input { + @builtin(global_invocation_id) gid: vec3u, + @builtin(num_workgroups) numWorkgroups: vec3u, + } + + @compute @workgroup_size(256) fn bitonicStepKernel(input: bitonicStepKernel_Input) { + let spanX = (input.numWorkgroups.x * 256u); + let spanY = (input.numWorkgroups.y * spanX); + let tid = ((input.gid.x + (input.gid.y * spanX)) + (input.gid.z * spanY)); + let k = uniforms.k; + let shift = uniforms.jShift; + let dataLength = arrayLength(&data); + let stride = (1u << shift); + let maskBelow = (stride - 1u); + let below = (tid & maskBelow); + let above = (tid >> shift); + let i = (below + (above * (stride << 1u))); + let ixj = (i + stride); + if ((ixj >= dataLength)) { + return; + } + let ascending = ((i & k) == 0u); + let left = data[i]; + let right = data[ixj]; + let leftFirst = defaultCompare(left, right); + let shouldSwap = select(leftFirst, !leftFirst, ascending); + if (shouldSwap) { + data[i] = right; + data[ixj] = left; + } + } + + struct copyParamsType { + srcLength: u32, + dstLength: u32, + paddingValue: u32, + } + + @group(0) @binding(2) var params: copyParamsType; + + @group(0) @binding(1) var dst: array; + + @group(0) @binding(0) var src: array; + + struct copyBackKernel_Input { + @builtin(global_invocation_id) gid: vec3u, + @builtin(num_workgroups) numWorkgroups: vec3u, + } + + @compute @workgroup_size(256) fn copyBackKernel(input: copyBackKernel_Input) { + let spanX = (input.numWorkgroups.x * 256u); + let spanY = (input.numWorkgroups.y * spanX); + let idx = ((input.gid.x + (input.gid.y * spanX)) + (input.gid.z * spanY)); + if ((idx < params.srcLength)) { + dst[idx] = src[idx]; + } + } + + struct fullScreenTriangle_Input { + @builtin(vertex_index) vertexIndex: u32, + } + + struct fullScreenTriangle_Output { + @builtin(position) pos: vec4f, + @location(0) uv: vec2f, + } + + @vertex fn fullScreenTriangle(in: fullScreenTriangle_Input) -> fullScreenTriangle_Output { + const pos = array(vec2f(-1, -1), vec2f(3, -1), vec2f(-1, 3)); + const uv = array(vec2f(0, 1), vec2f(2, 1), vec2f(0, -1)); + + return fullScreenTriangle_Output(vec4f(pos[in.vertexIndex], 0, 1), uv[in.vertexIndex]); + } + + @group(0) @binding(0) var initLength: u32; + + @group(1) @binding(0) var data: array; + + struct fragmentFn_Input { + @location(0) uv: vec2f, + } + + @fragment fn fragmentFn(input: fragmentFn_Input) -> @location(0) vec4f { + let arrayLength_1 = initLength; + let cols = u32(round(sqrt(f32(arrayLength_1)))); + let rows = u32(round((f32(arrayLength_1) / f32(cols)))); + let col = u32(floor((input.uv.x * f32(cols)))); + let row = u32(floor((input.uv.y * f32(rows)))); + let idx = ((row * cols) + col); + if ((idx >= arrayLength_1)) { + return vec4f(0.10000000149011612, 0.10000000149011612, 0.10000000149011612, 1); + } + let value = data[idx]; + let normalized = (f32(value) / 255f); + return vec4f(normalized, normalized, normalized, 1f); + }" + `); + }); +}); diff --git a/packages/typegpu/tests/utils/extendedIt.ts b/packages/typegpu/tests/utils/extendedIt.ts index a9cb8f3b22..b1630349a0 100644 --- a/packages/typegpu/tests/utils/extendedIt.ts +++ b/packages/typegpu/tests/utils/extendedIt.ts @@ -9,6 +9,7 @@ const adapterMock = { requestDevice: vi.fn((descriptor) => Promise.resolve(mockDevice)), limits: { maxStorageBufferBindingSize: 64 * 1024 * 1024, + maxBufferSize: 64 * 1024 * 1024, }, }; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 292bed1c87..f4492706f3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -212,9 +212,6 @@ importers: '@typegpu/color': specifier: workspace:* version: link:../../packages/typegpu-color - '@typegpu/concurrent-scan': - specifier: workspace:* - version: link:../../packages/typegpu-concurrent-scan '@typegpu/geometry': specifier: workspace:* version: link:../../packages/typegpu-geometry @@ -224,6 +221,9 @@ importers: '@typegpu/sdf': specifier: workspace:* version: link:../../packages/typegpu-sdf + '@typegpu/sort': + specifier: workspace:* + version: link:../../packages/typegpu-sort '@typegpu/three': specifier: workspace:* version: link:../../packages/typegpu-three @@ -558,7 +558,7 @@ importers: version: link:../unplugin-typegpu publishDirectory: dist - packages/typegpu-concurrent-scan: + packages/typegpu-geometry: devDependencies: '@typegpu/tgpu-dev-cli': specifier: workspace:* @@ -580,7 +580,7 @@ importers: version: link:../unplugin-typegpu publishDirectory: dist - packages/typegpu-geometry: + packages/typegpu-noise: devDependencies: '@typegpu/tgpu-dev-cli': specifier: workspace:* @@ -602,7 +602,7 @@ importers: version: link:../unplugin-typegpu publishDirectory: dist - packages/typegpu-noise: + packages/typegpu-sdf: devDependencies: '@typegpu/tgpu-dev-cli': specifier: workspace:* @@ -624,7 +624,7 @@ importers: version: link:../unplugin-typegpu publishDirectory: dist - packages/typegpu-sdf: + packages/typegpu-sort: devDependencies: '@typegpu/tgpu-dev-cli': specifier: workspace:*