From b00369931cebcdc0ab67b841a9faef157d1f6754 Mon Sep 17 00:00:00 2001 From: Lukas Herzberger Date: Wed, 24 Apr 2024 17:13:54 +0200 Subject: [PATCH] add support for f16 --- README.md | 13 +++-- dist/index.d.ts | 27 ++++++++- dist/index.js | 122 +++++++++++++++++++++++++-------------- misc/index.html | 8 ++- src/index.ts | 149 +++++++++++++++++++++++++++++++++--------------- 5 files changed, 221 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index aed8492..2284e27 100644 --- a/README.md +++ b/README.md @@ -128,21 +128,21 @@ minMaxDepthPass.encode(computePassEncoder); #### Prepare pipelines for expected formats -In the above examples, GPU resources, like compute pipelines and bind group layouts etc., are created on the fly the first time a new configuration of `GPUDevice`, `GPUTextureFormat`, and filter is needed. +In the above examples, GPU resources, like compute pipelines and bind group layouts etc., are created on the fly the first time a new configuration of `GPUDevice`, `GPUTextureFormat`, filter, and precision is needed. 
WebGPU SPD also supports allocating resources during setup, like this: ```js -import { WebGPUSinglePassDownsampler, SPDFilters } from 'webgpu-spd'; +import { WebGPUSinglePassDownsampler, SPDFilters, SPDPrecision } from 'webgpu-spd'; const downsampler = new WebGPUSinglePassDownsampler({ device, formats: [ - { format: 'rgba8unorm' }, + { format: 'rgba8unorm', precision: SPDPrecision.F16 }, { format: 'r32float', filters: [ SPDFilters.Min ] }, ]}); // alternatively call downsampler.prepareDeviceResources({ device, formats: [ - { format: 'rgba8unorm' }, + { format: 'rgba8unorm', precision: SPDPrecision.F16 }, { format: 'r32float', filters: [ SPDFilters.Min ] }, ]}); ``` @@ -185,7 +185,8 @@ Custom filters for downsampling a quad to a single pixel can be registered with The given WGSL code must at least define a reduction function with the following name and signature: ```wgsl -fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> +// SPDFloat is an alias for either f32 or f16, depending on the configuration +fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> ``` For example, a custom filter that only takes a single pixel value out of the four given ones could be implemented and used like this: @@ -195,7 +196,7 @@ import { WebGPUSinglePassDownsampler } from 'webgpu-spd'; const downsampler = new WebGPUSinglePassDownsampler(); downsampler.registerFilter('upperLeft', ` - fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> { + fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> { return v0; } `); diff --git a/dist/index.d.ts b/dist/index.d.ts index c73be2a..decd5d8 100644 --- a/dist/index.d.ts +++ b/dist/index.d.ts @@ -48,6 +48,19 @@ export declare class SPDPass { */ encode(computePassEncoder: GPUComputePassEncoder): GPUComputePassEncoder; } +/** + * Float precision supported by WebGPU SPD. + */ +export declare enum SPDPrecision { + /** + * Full precision (32-bit) floats. 
+ */ + F32 = "f32", + /** + * Half precision (16-bit) floats. + */ + F16 = "f16" +} /** * Configuration for {@link WebGPUSinglePassDownsampler.preparePass}. */ @@ -82,6 +95,12 @@ export interface SPDPassConfig { * Defaults to target.mipLevelCount. */ numMips?: number; + /** + * The float precision to use for downsampling. + * Falls back to {@link SPDPrecision.F32}, if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled). + * Defaults to {@link SPDPrecision.F32}. + */ + precision?: SPDPrecision; } export interface SPDPrepareFormatDescriptor { /** @@ -93,6 +112,12 @@ export interface SPDPrepareFormatDescriptor { * Defaults to {@link SPDFilters.Average}. */ filters?: Set; + /** + * The float precision to use for this combination of texture format and filters. + * Falls back to {@link SPDPrecision.F32}, if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled). + * Defaults to {@link SPDPrecision.F32}. 
+ */ + precision?: SPDPrecision; } export interface SPDPrepareDeviceDescriptor { /** @@ -157,7 +182,7 @@ export declare class WebGPUSinglePassDownsampler { * * The given WGSL code must (at least) specify a function to reduce four values into one with the following name and signature: * - * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 + * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 * * @param name The unique name of the filter operation * @param wgsl The WGSL code to inject into the downsampling shader as the filter operation diff --git a/dist/index.js b/dist/index.js index 7b615ac..4e8c06f 100644 --- a/dist/index.js +++ b/dist/index.js @@ -1,4 +1,4 @@ -function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) { +function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips, halfPrecision = false) { const mipsBindings = Array(numMips).fill(0) .map((_, i) => `@group(0) @binding(${i + 1}) var dst_mip_${i + 1}: texture_storage_2d_array<${outputFormat}, write>;`) .join('\n'); @@ -7,16 +7,17 @@ function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) { .map((_, i) => { if (i == 5 && numMips > 6) { return ` else if mip == 6 { - textureStore(dst_mip_6, uv, slice, value); - mip_dst_6_buffer[slice][uv.y][uv.x] = value; + let val32 = vec4(value); + textureStore(dst_mip_6, uv, slice, val32); + mip_dst_6_buffer[slice][uv.y][uv.x] = val32; }`; } return `${i === 0 ? '' : ' else '}if mip == ${i + 1} { - textureStore(dst_mip_${i + 1}, uv, slice, value); + textureStore(dst_mip_${i + 1}, uv, slice, vec4(value)); }`; }) .join(''); - const mipsAccessor = `fn store_dst_mip(value: vec4, uv: vec2, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}`; + const mipsAccessor = `fn store_dst_mip(value: vec4, uv: vec2, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}`; const midMipAccessor = `return mip_dst_6_buffer[slice][uv.y][uv.x];`; return /* wgsl */ ` // This file is part of the FidelityFX SDK. 
@@ -42,6 +43,11 @@ function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) { // THE SOFTWARE. +// Definitions -------------------------------------------------------------------------------------------------------- + +${halfPrecision ? 'enable f16;' : ''} +alias SPDFloat = ${halfPrecision ? 'f16' : 'f32'}; + // Helpers ------------------------------------------------------------------------------------------------------------ /** @@ -83,9 +89,9 @@ fn map_to_xy(local_invocation_index: u32) -> vec2 { * * @returns A value in SRGB space. */ -fn srgb_to_linear(value: f32) -> f32 { - let j = vec3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); - let k = vec2(1.055, -0.055); +fn srgb_to_linear(value: SPDFloat) -> SPDFloat { + let j = vec3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + let k = vec2(1.055, -0.055); return clamp(j.x, value * j.y, pow(value, j.z) * k.x + k.y); } @@ -117,19 +123,19 @@ fn get_work_group_offset() -> vec2 { return downsample_pass_meta.work_group_offset; } -fn load_src_image(uv: vec2, slice: u32) -> vec4 { - return textureLoad(src_mip_0, uv, slice, 0); +fn load_src_image(uv: vec2, slice: u32) -> vec4 { + return vec4(textureLoad(src_mip_0, uv, slice, 0)); } -fn load_mid_mip_image(uv: vec2, slice: u32) -> vec4 { - ${numMips > 6 ? midMipAccessor : 'return vec4();'} +fn load_mid_mip_image(uv: vec2, slice: u32) -> vec4 { + ${numMips > 6 ? 
midMipAccessor : 'return vec4();'} } ${mipsAccessor} // Workgroup ----------------------------------------------------------------------------------------------------------- -var spd_intermediate: array, 16>, 16>; +var spd_intermediate: array, 16>, 16>; var spd_counter: atomic; fn spd_increase_atomic_counter(slice: u32) { @@ -165,19 +171,19 @@ fn spd_exit_workgroup(num_work_groups: u32, local_invocation_index: u32, slice: ${filterOp} -fn spd_store(pix: vec2, out_value: vec4, mip: u32, slice: u32) { +fn spd_store(pix: vec2, out_value: vec4, mip: u32, slice: u32) { store_dst_mip(out_value, pix, slice, mip + 1); } -fn spd_load_intermediate(x: u32, y: u32) -> vec4 { +fn spd_load_intermediate(x: u32, y: u32) -> vec4 { return spd_intermediate[x][y]; } -fn spd_store_intermediate(x: u32, y: u32, value: vec4) { +fn spd_store_intermediate(x: u32, y: u32, value: vec4) { spd_intermediate[x][y] = value; } -fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2) -> vec4 { +fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2) -> vec4 { let v0 = spd_load_intermediate(i0.x, i0.y); let v1 = spd_load_intermediate(i1.x, i1.y); let v2 = spd_load_intermediate(i2.x, i2.y); @@ -185,7 +191,7 @@ fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2 return spd_reduce_4(v0, v1, v2, v3); } -fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { +fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { let v0 = load_src_image(base + vec2(0, 0), slice); let v1 = load_src_image(base + vec2(0, 1), slice); let v2 = load_src_image(base + vec2(1, 0), slice); @@ -193,7 +199,7 @@ fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { return spd_reduce_4(v0, v1, v2, v3); } -fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { +fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { let v0 = load_mid_mip_image(base + vec2(0, 0), slice); let v1 = load_mid_mip_image(base + vec2(0, 1), slice); let v2 = load_mid_mip_image(base + vec2(1, 0), slice); @@ 
-204,7 +210,7 @@ fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { // Main logic --------------------------------------------------------------------------------------------------------- fn spd_downsample_mips_0_1(x: u32, y: u32, workgroup_id: vec2, local_invocation_index: u32, mip: u32, slice: u32) { - var v: array, 4>; + var v: array, 4>; let workgroup64 = workgroup_id.xy * 64; let workgroup32 = workgroup_id.xy * 32; @@ -479,24 +485,24 @@ fn downsample(@builtin(local_invocation_index) local_invocation_index: u32, @bui `; } const SPD_FILTER_AVERAGE = /* wgsl */ ` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return (v0 + v1 + v2 + v3) * 0.25; } `; const SPD_FILTER_MIN = /* wgsl */ ` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return min(min(v0, v1), min(v2, v3)); } `; const SPD_FILTER_MAX = /* wgsl */ ` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return max(max(v0, v1), max(v2, v3)); } `; const SPD_FILTER_MINMAX = /* wgsl */ ` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { let max4 = max(max(v0.xy, v1.xy), max(v2.xy, v3.xy)); - return vec4(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0); + return vec4(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0); } `; const SUPPORTED_FORMATS = new Set([ @@ -586,6 +592,20 @@ export class SPDPass { return computePassEncoder; } } +/** + * Float precision supported by WebGPU SPD. + */ +export var SPDPrecision; +(function (SPDPrecision) { + /** + * Full precision (32-bit) floats. + */ + SPDPrecision["F32"] = "f32"; + /** + * Half precision (16-bit) floats. 
+ */ + SPDPrecision["F16"] = "f16"; +})(SPDPrecision || (SPDPrecision = {})); class SPDPipeline { mipsLayout; pipelines; @@ -659,16 +679,29 @@ class DevicePipelines { }); } } + sanitizePrecision(precision) { + const device = this.device.deref(); + if (!device) { + return precision; + } + else if (precision === SPDPrecision.F16 && !device.features.has('shader-f16')) { + console.warn(`[DevicePipelines::sanitizePrecision]: half precision requested but the device feature 'shader-f16' is not enabled, falling back to full precision`); + return SPDPrecision.F32; + } + else { + return precision; + } + } preparePipelines(pipelineConfigs) { - pipelineConfigs?.map(c => { + pipelineConfigs?.forEach(c => { Array.from(c.filters ?? [SPD_FILTER_AVERAGE]).map(f => { for (let i = 0; i < this.maxMipsPerPass; ++i) { - this.getOrCreatePipeline(c.format, f, i + 1); + this.getOrCreatePipeline(c.format, f, i + 1, c.precision ?? SPDPrecision.F32); } }); }); } - createPipeline(targetFormat, filterCode, numMips) { + createPipeline(targetFormat, filterCode, numMips, precision) { const device = this.device.deref(); if (!device) { return undefined; @@ -696,12 +729,11 @@ class DevicePipelines { return entry; }) }); - const module = device.createShaderModule({ - code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass)), - }); return new SPDPipeline(mipsBindGroupLayout, device.createComputePipeline({ compute: { - module, + module: device.createShaderModule({ + code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass), precision === SPDPrecision.F16), + }), entryPoint: 'downsample', }, layout: device.createPipelineLayout({ @@ -712,20 +744,24 @@ class DevicePipelines { }), })); } - getOrCreatePipeline(targetFormat, filterCode, numMipsToCreate) { + getOrCreatePipeline(targetFormat, filterCode, numMipsToCreate, precision) { + const sanitizedPrecision = this.sanitizePrecision(precision); if (!this.pipelines.has(targetFormat)) { 
this.pipelines.set(targetFormat, new Map()); } - if (!this.pipelines.get(targetFormat)?.has(filterCode)) { - this.pipelines.get(targetFormat)?.set(filterCode, new Map()); + if (!this.pipelines.get(targetFormat)?.has(sanitizedPrecision)) { + this.pipelines.get(targetFormat)?.set(sanitizedPrecision, new Map()); + } + if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.has(filterCode)) { + this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.set(filterCode, new Map()); } - if (!this.pipelines.get(targetFormat)?.get(filterCode)?.has(numMipsToCreate)) { - const pipelines = this.createPipeline(targetFormat, filterCode, numMipsToCreate); + if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.has(numMipsToCreate)) { + const pipelines = this.createPipeline(targetFormat, filterCode, numMipsToCreate, sanitizedPrecision); if (pipelines) { - this.pipelines.get(targetFormat)?.get(filterCode)?.set(numMipsToCreate, pipelines); + this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.set(numMipsToCreate, pipelines); } } - return this.pipelines.get(targetFormat)?.get(filterCode)?.get(numMipsToCreate); + return this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.get(numMipsToCreate); } getOrCreateMidMipBuffer(device, numArrayLayers) { if (!this.midMipBuffers.has(numArrayLayers)) { @@ -783,7 +819,7 @@ class DevicePipelines { }); } } - preparePass(texture, target, filterCode, offset, size, numMipsTotal) { + preparePass(texture, target, filterCode, offset, size, numMipsTotal, precision) { const device = this.device.deref(); if (!device) { return undefined; @@ -805,7 +841,7 @@ class DevicePipelines { numArrayLayers: numArrayLayersThisPass, }); // todo: handle missing pipeline - const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass); + const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass, precision); const mipViews = 
Array(numMipsThisPass + 1).fill(0).map((_, i) => { if (baseMip === 0 && i === 0) { return texture.createView({ @@ -923,7 +959,7 @@ export class WebGPUSinglePassDownsampler { * * The given WGSL code must (at least) specify a function to reduce four values into one with the following name and signature: * - * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 + * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 * * @param name The unique name of the filter operation * @param wgsl The WGSL code to inject into the downsampling shader as the filter operation @@ -988,7 +1024,7 @@ export class WebGPUSinglePassDownsampler { console.warn(`[GPUSinglePassDownsampler::prepare]: filter ${filter} makes no sense for one-component target format ${target.format}`); } const filterCode = this.filters.get(filter) ?? SPD_FILTER_AVERAGE; - return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips); + return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips, config?.precision ?? SPDPrecision.F32); } /** * Generates mipmaps for the given texture. diff --git a/misc/index.html b/misc/index.html index 5d3ce8e..6c81b34 100644 --- a/misc/index.html +++ b/misc/index.html @@ -71,6 +71,10 @@
Misc
+
+ + +
@@ -147,7 +151,7 @@

Display options

async function main() { const adapter = await navigator.gpu.requestAdapter(); - const device = await adapter.requestDevice({requiredLimits: WebGPUSinglePassDownsampler.setPreferredLimits({}, adapter)}); + const device = await adapter.requestDevice({requiredLimits: WebGPUSinglePassDownsampler.setPreferredLimits({}, adapter), requiredFeatures: adapter.features.has('shader-f16') ? ['shader-f16'] : []}); const canvas = document.querySelector('canvas'); const context = canvas.getContext('webgpu'); @@ -243,6 +247,7 @@

Display options

const roiHeight = document.getElementById('roiHeight'); const intoTarget = document.getElementById('intoTarget'); + const halfPrecision = document.getElementById('halfPrecision'); const onNewTexture = texture => { if (filterRadio.filter(f => f.checked)[0].id === 'custom') { @@ -259,6 +264,7 @@

Display options

offset: [offsetX.value, offsetY.value], size: useRoiSize.checked ? [roiWidth.value, roiHeight.value] : [texture.width, texture.height], target, + precision: halfPrecision.checked ? 'f16' : 'f32', }; if (!downsampler.generateMipmaps(device, texture, config)) { console.warn(`could not downsample texture generated from ${textureUrl}`); diff --git a/src/index.ts b/src/index.ts index f9cb3a3..3b4e59b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,4 @@ -function makeShaderCode(outputFormat: string, filterOp: string = SPD_FILTER_AVERAGE, numMips: number): string { +function makeShaderCode(outputFormat: string, filterOp: string = SPD_FILTER_AVERAGE, numMips: number, halfPrecision: boolean = false): string { const mipsBindings = Array(numMips).fill(0) .map((_, i) => `@group(0) @binding(${i + 1}) var dst_mip_${i + 1}: texture_storage_2d_array<${outputFormat}, write>;`) .join('\n'); @@ -8,17 +8,18 @@ function makeShaderCode(outputFormat: string, filterOp: string = SPD_FILTER_AVER .map((_, i) => { if (i == 5 && numMips > 6) { return ` else if mip == 6 { - textureStore(dst_mip_6, uv, slice, value); - mip_dst_6_buffer[slice][uv.y][uv.x] = value; + let val32 = vec4(value); + textureStore(dst_mip_6, uv, slice, val32); + mip_dst_6_buffer[slice][uv.y][uv.x] = val32; }` } return `${i === 0 ? '' : ' else '}if mip == ${i + 1} { - textureStore(dst_mip_${i + 1}, uv, slice, value); + textureStore(dst_mip_${i + 1}, uv, slice, vec4(value)); }`; }) .join(''); - const mipsAccessor = `fn store_dst_mip(value: vec4, uv: vec2, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}` + const mipsAccessor = `fn store_dst_mip(value: vec4, uv: vec2, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}` const midMipAccessor =`return mip_dst_6_buffer[slice][uv.y][uv.x];`; return /* wgsl */` @@ -45,6 +46,11 @@ function makeShaderCode(outputFormat: string, filterOp: string = SPD_FILTER_AVER // THE SOFTWARE. 
+// Definitions -------------------------------------------------------------------------------------------------------- + +${halfPrecision ? 'enable f16;' : ''} +alias SPDFloat = ${halfPrecision ? 'f16' : 'f32'}; + // Helpers ------------------------------------------------------------------------------------------------------------ /** @@ -86,9 +92,9 @@ fn map_to_xy(local_invocation_index: u32) -> vec2 { * * @returns A value in SRGB space. */ -fn srgb_to_linear(value: f32) -> f32 { - let j = vec3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); - let k = vec2(1.055, -0.055); +fn srgb_to_linear(value: SPDFloat) -> SPDFloat { + let j = vec3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + let k = vec2(1.055, -0.055); return clamp(j.x, value * j.y, pow(value, j.z) * k.x + k.y); } @@ -120,19 +126,19 @@ fn get_work_group_offset() -> vec2 { return downsample_pass_meta.work_group_offset; } -fn load_src_image(uv: vec2, slice: u32) -> vec4 { - return textureLoad(src_mip_0, uv, slice, 0); +fn load_src_image(uv: vec2, slice: u32) -> vec4 { + return vec4(textureLoad(src_mip_0, uv, slice, 0)); } -fn load_mid_mip_image(uv: vec2, slice: u32) -> vec4 { - ${numMips > 6 ? midMipAccessor : 'return vec4();'} +fn load_mid_mip_image(uv: vec2, slice: u32) -> vec4 { + ${numMips > 6 ? 
midMipAccessor : 'return vec4();'} } ${mipsAccessor} // Workgroup ----------------------------------------------------------------------------------------------------------- -var spd_intermediate: array, 16>, 16>; +var spd_intermediate: array, 16>, 16>; var spd_counter: atomic; fn spd_increase_atomic_counter(slice: u32) { @@ -168,19 +174,19 @@ fn spd_exit_workgroup(num_work_groups: u32, local_invocation_index: u32, slice: ${filterOp} -fn spd_store(pix: vec2, out_value: vec4, mip: u32, slice: u32) { +fn spd_store(pix: vec2, out_value: vec4, mip: u32, slice: u32) { store_dst_mip(out_value, pix, slice, mip + 1); } -fn spd_load_intermediate(x: u32, y: u32) -> vec4 { +fn spd_load_intermediate(x: u32, y: u32) -> vec4 { return spd_intermediate[x][y]; } -fn spd_store_intermediate(x: u32, y: u32, value: vec4) { +fn spd_store_intermediate(x: u32, y: u32, value: vec4) { spd_intermediate[x][y] = value; } -fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2) -> vec4 { +fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2) -> vec4 { let v0 = spd_load_intermediate(i0.x, i0.y); let v1 = spd_load_intermediate(i1.x, i1.y); let v2 = spd_load_intermediate(i2.x, i2.y); @@ -188,7 +194,7 @@ fn spd_reduce_intermediate(i0: vec2, i1: vec2, i2: vec2, i3: vec2 return spd_reduce_4(v0, v1, v2, v3); } -fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { +fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { let v0 = load_src_image(base + vec2(0, 0), slice); let v1 = load_src_image(base + vec2(0, 1), slice); let v2 = load_src_image(base + vec2(1, 0), slice); @@ -196,7 +202,7 @@ fn spd_reduce_load_4(base: vec2, slice: u32) -> vec4 { return spd_reduce_4(v0, v1, v2, v3); } -fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { +fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { let v0 = load_mid_mip_image(base + vec2(0, 0), slice); let v1 = load_mid_mip_image(base + vec2(0, 1), slice); let v2 = load_mid_mip_image(base + vec2(1, 0), slice); @@ 
-207,7 +213,7 @@ fn spd_reduce_load_mid_mip_4(base: vec2, slice: u32) -> vec4 { // Main logic --------------------------------------------------------------------------------------------------------- fn spd_downsample_mips_0_1(x: u32, y: u32, workgroup_id: vec2, local_invocation_index: u32, mip: u32, slice: u32) { - var v: array, 4>; + var v: array, 4>; let workgroup64 = workgroup_id.xy * 64; let workgroup32 = workgroup_id.xy * 32; @@ -483,27 +489,27 @@ fn downsample(@builtin(local_invocation_index) local_invocation_index: u32, @bui } const SPD_FILTER_AVERAGE: string = /* wgsl */` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return (v0 + v1 + v2 + v3) * 0.25; } `; const SPD_FILTER_MIN = /* wgsl */` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return min(min(v0, v1), min(v2, v3)); } `; const SPD_FILTER_MAX = /* wgsl */` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { return max(max(v0, v1), max(v2, v3)); } `; const SPD_FILTER_MINMAX = /* wgsl */` -fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { +fn spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 { let max4 = max(max(v0.xy, v1.xy), max(v2.xy, v3.xy)); - return vec4(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0); + return vec4(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0); } `; @@ -590,6 +596,28 @@ export class SPDPass { computePassEncoder.setBindGroup(1, null); return computePassEncoder; } + + /** + * Returns the number of passes that will be encoded by calling this instance's {@link SPDPass.encode} method. + */ + get numPasses(): number { + return this.passes.length + } +} + +/** + * Float precision supported by WebGPU SPD. 
+ */ +export enum SPDPrecision { + /** + * Full precision (32-bit) floats. + */ + F32 = 'f32', + + /** + * Half precision (16-bit) floats. + */ + F16 = 'f16', } /** @@ -630,6 +658,13 @@ export interface SPDPassConfig { * Defaults to target.mipLevelCount. */ numMips?: number, + + /** + * The float precision to use for downsampling. + * Falls back to {@link SPDPrecision.F32}, if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled). + * Defaults to {@link SPDPrecision.F32}. + */ + precision?: SPDPrecision; } interface GPUDownsamplingMeta { @@ -654,6 +689,13 @@ export interface SPDPrepareFormatDescriptor { * Defaults to {@link SPDFilters.Average}. */ filters?: Set, + + /** + * The float precision to use for this combination of texture format and filters. + * Falls back to {@link SPDPrecision.F32}, if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled). + * Defaults to {@link SPDPrecision.F32}. 
+ */ + precision?: SPDPrecision, } export interface SPDPrepareDeviceDescriptor { @@ -683,7 +725,7 @@ class DevicePipelines { private internalResourcesBindGroupLayout12?: GPUBindGroupLayout; private atomicCounters?: GPUBuffer; private midMipBuffers: Map; - private pipelines: Map>>; + private pipelines: Map>>>; constructor(device: GPUDevice, maxArrayLayers?: number) { this.device = new WeakRef(device); @@ -744,18 +786,29 @@ class DevicePipelines { } } + private sanitizePrecision(precision: SPDPrecision): SPDPrecision { + const device = this.device.deref(); + if (!device) { + return precision; + } else if (precision === SPDPrecision.F16 && !device.features.has('shader-f16')) { + console.warn(`[DevicePipelines::sanitizePrecision]: half precision requested but the device feature 'shader-f16' is not enabled, falling back to full precision`); + return SPDPrecision.F32; + } else { + return precision; + } + } + preparePipelines(pipelineConfigs?: Array) { - pipelineConfigs?.map(c => { + pipelineConfigs?.forEach(c => { Array.from(c.filters ?? [SPD_FILTER_AVERAGE]).map(f => { for (let i = 0; i < this.maxMipsPerPass; ++i) { - this.getOrCreatePipeline(c.format, f, i + 1); + this.getOrCreatePipeline(c.format, f, i + 1, c.precision ?? 
SPDPrecision.F32); } - }) - + }); }); } - private createPipeline(targetFormat: GPUTextureFormat, filterCode: string, numMips: number): SPDPipeline | undefined { + private createPipeline(targetFormat: GPUTextureFormat, filterCode: string, numMips: number, precision: SPDPrecision): SPDPipeline | undefined { const device = this.device.deref(); if (!device) { return undefined; @@ -783,15 +836,13 @@ class DevicePipelines { }) }); - const module = device.createShaderModule({ - code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass)), - }); - return new SPDPipeline( mipsBindGroupLayout, device.createComputePipeline({ compute: { - module, + module: device.createShaderModule({ + code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass), precision === SPDPrecision.F16), + }), entryPoint: 'downsample', }, layout: device.createPipelineLayout({ @@ -804,20 +855,24 @@ class DevicePipelines { ); } - private getOrCreatePipeline(targetFormat: GPUTextureFormat, filterCode: string, numMipsToCreate: number): SPDPipeline | undefined { + private getOrCreatePipeline(targetFormat: GPUTextureFormat, filterCode: string, numMipsToCreate: number, precision: SPDPrecision): SPDPipeline | undefined { + const sanitizedPrecision = this.sanitizePrecision(precision); if (!this.pipelines.has(targetFormat)) { this.pipelines.set(targetFormat, new Map()); } - if (!this.pipelines.get(targetFormat)?.has(filterCode)) { - this.pipelines.get(targetFormat)?.set(filterCode, new Map()); + if (!this.pipelines.get(targetFormat)?.has(sanitizedPrecision)) { + this.pipelines.get(targetFormat)?.set(sanitizedPrecision, new Map()); + } + if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.has(filterCode)) { + this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.set(filterCode, new Map()); } - if (!this.pipelines.get(targetFormat)?.get(filterCode)?.has(numMipsToCreate)) { - const pipelines = this.createPipeline(targetFormat, filterCode, 
numMipsToCreate); + if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.has(numMipsToCreate)) { + const pipelines = this.createPipeline(targetFormat, filterCode, numMipsToCreate, sanitizedPrecision); if (pipelines) { - this.pipelines.get(targetFormat)?.get(filterCode)?.set(numMipsToCreate, pipelines); + this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.set(numMipsToCreate, pipelines); } } - return this.pipelines.get(targetFormat)?.get(filterCode)?.get(numMipsToCreate); + return this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.get(numMipsToCreate); } private getOrCreateMidMipBuffer(device: GPUDevice, numArrayLayers: number): GPUBuffer { @@ -877,7 +932,7 @@ class DevicePipelines { } } - preparePass(texture: GPUTexture, target: GPUTexture, filterCode: string, offset: [number, number], size: [number, number], numMipsTotal: number): SPDPass | undefined { + preparePass(texture: GPUTexture, target: GPUTexture, filterCode: string, offset: [number, number], size: [number, number], numMipsTotal: number, precision: SPDPrecision): SPDPass | undefined { const device = this.device.deref(); if (!device) { return undefined; @@ -903,7 +958,7 @@ class DevicePipelines { }); // todo: handle missing pipeline - const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass)!; + const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass, precision)!; const mipViews = Array(numMipsThisPass + 1).fill(0).map((_, i) => { if (baseMip === 0 && i === 0) { @@ -1031,7 +1086,7 @@ export class WebGPUSinglePassDownsampler { * * The given WGSL code must (at least) specify a function to reduce four values into one with the following name and signature: * - * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 + * spd_reduce_4(v0: vec4, v1: vec4, v2: vec4, v3: vec4) -> vec4 * * @param name The unique name of the filter operation * @param wgsl The WGSL code to inject 
into the downsampling shader as the filter operation @@ -1099,7 +1154,7 @@ export class WebGPUSinglePassDownsampler { } const filterCode = this.filters.get(filter) ?? SPD_FILTER_AVERAGE; - return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips); + return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips, config?.precision ?? SPDPrecision.F32); } /**