Commit: add support for f16

JolifantoBambla committed Apr 24, 2024
1 parent 6d496ed commit b003699
Showing 5 changed files with 221 additions and 98 deletions.
README.md: 13 changes (7 additions, 6 deletions)
@@ -128,21 +128,21 @@ minMaxDepthPass.encode(computePassEncoder);

#### Prepare pipelines for expected formats

In the above examples, GPU resources, like compute pipelines and bind group layouts etc., are created on the fly the first time a new configuration of `GPUDevice`, `GPUTextureFormat`, and filter is needed.
In the above examples, GPU resources, like compute pipelines and bind group layouts etc., are created on the fly the first time a new configuration of `GPUDevice`, `GPUTextureFormat`, filter, and precision is needed.

WebGPU SPD also supports allocating resources during setup, like this:

```js
import { WebGPUSinglePassDownsampler, SPDFilters } from 'webgpu-spd';
import { WebGPUSinglePassDownsampler, SPDFilters, SPDPrecision } from 'webgpu-spd';

const downsampler = new WebGPUSinglePassDownsampler({ device, formats: [
{ format: 'rgba8unorm' },
{ format: 'rgba8unorm', precision: SPDPrecision.F16 },
{ format: 'r32float', filters: [ SPDFilters.Min ] },
]});

// alternatively call
downsampler.prepareDeviceResources({ device, formats: [
{ format: 'rgba8unorm' },
{ format: 'rgba8unorm', precision: SPDPrecision.F16 },
{ format: 'r32float', filters: [ SPDFilters.Min ] },
]});
```
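Half-precision pipelines require the `shader-f16` device feature; if it is not enabled, WebGPU SPD logs a warning and falls back to full precision. A minimal sketch of the device setup that avoids the fallback (plain WebGPU API, not part of this library):

```js
// Sketch: request 'shader-f16' at device creation so that
// SPDPrecision.F16 does not silently fall back to f32.
const adapter = await navigator.gpu.requestAdapter();
const device = await adapter.requestDevice({
    requiredFeatures: adapter.features.has('shader-f16') ? ['shader-f16'] : [],
});
```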
@@ -185,7 +185,8 @@ Custom filters for downsampling a quad to a single pixel can be registered with
The given WGSL code must at least define a reduction function with the following name and signature:

```wgsl
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32>
// SPDFloat is an alias for either f32 or f16, depending on the configuration
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat>
```

For example, a custom filter that only takes a single pixel value out of the four given ones could be implemented and used like this:
@@ -195,7 +196,7 @@ import { WebGPUSinglePassDownsampler } from 'webgpu-spd';

const downsampler = new WebGPUSinglePassDownsampler();
downsampler.registerFilter('upperLeft', `
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> {
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> {
return v0;
}
`);
dist/index.d.ts: 27 changes (26 additions, 1 deletion)
@@ -48,6 +48,19 @@ export declare class SPDPass {
*/
encode(computePassEncoder: GPUComputePassEncoder): GPUComputePassEncoder;
}
/**
* Float precision supported by WebGPU SPD.
*/
export declare enum SPDPrecision {
/**
* Full precision (32-bit) floats.
*/
F32 = "f32",
/**
* Half precision (16-bit) floats.
*/
F16 = "f16"
}
/**
* Configuration for {@link WebGPUSinglePassDownsampler.preparePass}.
*/
@@ -82,6 +95,12 @@ export interface SPDPassConfig {
* Defaults to target.mipLevelCount.
*/
numMips?: number;
/**
* The float precision to use for downsampling.
* Falls back to {@link SPDPrecision.F32} if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled).
* Defaults to {@link SPDPrecision.F32}.
*/
precision?: SPDPrecision;
}
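For illustration, the precision-related part of a pass configuration might look like this (a sketch; only the two fields visible in this diff are shown, and the `numMips` value is arbitrary):

```js
// Sketch: request half-precision downsampling for a single pass.
// Both fields are optional; precision falls back to F32 if the
// device does not have 'shader-f16' enabled.
const passConfig = {
    numMips: 4,
    precision: SPDPrecision.F16,
};
```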
export interface SPDPrepareFormatDescriptor {
/**
@@ -93,6 +112,12 @@ export interface SPDPrepareFormatDescriptor {
* Defaults to {@link SPDFilters.Average}.
*/
filters?: Set<string>;
/**
* The float precision to use for this combination of texture format and filters.
* Falls back to {@link SPDPrecision.F32} if {@link SPDPrecision.F16} is requested but not supported by the device (feature 'shader-f16' not enabled).
* Defaults to {@link SPDPrecision.F32}.
*/
precision?: SPDPrecision;
}
export interface SPDPrepareDeviceDescriptor {
/**
@@ -157,7 +182,7 @@ export declare class WebGPUSinglePassDownsampler {
*
* The given WGSL code must (at least) specify a function to reduce four values into one with the following name and signature:
*
* spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32>
* spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat>
*
* @param name The unique name of the filter operation
* @param wgsl The WGSL code to inject into the downsampling shader as the filter operation
dist/index.js: 122 changes (79 additions, 43 deletions)
@@ -1,4 +1,4 @@
function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) {
function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips, halfPrecision = false) {
const mipsBindings = Array(numMips).fill(0)
.map((_, i) => `@group(0) @binding(${i + 1}) var dst_mip_${i + 1}: texture_storage_2d_array<${outputFormat}, write>;`)
.join('\n');
@@ -7,16 +7,17 @@ function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) {
.map((_, i) => {
if (i == 5 && numMips > 6) {
return ` else if mip == 6 {
textureStore(dst_mip_6, uv, slice, value);
mip_dst_6_buffer[slice][uv.y][uv.x] = value;
let val32 = vec4<f32>(value);
textureStore(dst_mip_6, uv, slice, val32);
mip_dst_6_buffer[slice][uv.y][uv.x] = val32;
}`;
}
return `${i === 0 ? '' : ' else '}if mip == ${i + 1} {
textureStore(dst_mip_${i + 1}, uv, slice, value);
textureStore(dst_mip_${i + 1}, uv, slice, vec4<f32>(value));
}`;
})
.join('');
const mipsAccessor = `fn store_dst_mip(value: vec4<f32>, uv: vec2<u32>, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}`;
const mipsAccessor = `fn store_dst_mip(value: vec4<SPDFloat>, uv: vec2<u32>, slice: u32, mip: u32) {\n${mipsAccessorBody}\n}`;
const midMipAccessor = `return mip_dst_6_buffer[slice][uv.y][uv.x];`;
return /* wgsl */ `
// This file is part of the FidelityFX SDK.
@@ -42,6 +43,11 @@ function makeShaderCode(outputFormat, filterOp = SPD_FILTER_AVERAGE, numMips) {
// THE SOFTWARE.
// Definitions --------------------------------------------------------------------------------------------------------
${halfPrecision ? 'enable f16;' : ''}
alias SPDFloat = ${halfPrecision ? 'f16' : 'f32'};
// Helpers ------------------------------------------------------------------------------------------------------------
/**
@@ -83,9 +89,9 @@ fn map_to_xy(local_invocation_index: u32) -> vec2<u32> {
*
* @returns A value in SRGB space.
*/
fn srgb_to_linear(value: f32) -> f32 {
let j = vec3<f32>(0.0031308 * 12.92, 12.92, 1.0 / 2.4);
let k = vec2<f32>(1.055, -0.055);
fn srgb_to_linear(value: SPDFloat) -> SPDFloat {
let j = vec3<SPDFloat>(0.0031308 * 12.92, 12.92, 1.0 / 2.4);
let k = vec2<SPDFloat>(1.055, -0.055);
return clamp(j.x, value * j.y, pow(value, j.z) * k.x + k.y);
}
@@ -117,19 +123,19 @@ fn get_work_group_offset() -> vec2<u32> {
return downsample_pass_meta.work_group_offset;
}
fn load_src_image(uv: vec2<u32>, slice: u32) -> vec4<f32> {
return textureLoad(src_mip_0, uv, slice, 0);
fn load_src_image(uv: vec2<u32>, slice: u32) -> vec4<SPDFloat> {
return vec4<SPDFloat>(textureLoad(src_mip_0, uv, slice, 0));
}
fn load_mid_mip_image(uv: vec2<u32>, slice: u32) -> vec4<f32> {
${numMips > 6 ? midMipAccessor : 'return vec4<f32>();'}
fn load_mid_mip_image(uv: vec2<u32>, slice: u32) -> vec4<SPDFloat> {
${numMips > 6 ? midMipAccessor : 'return vec4<SPDFloat>();'}
}
${mipsAccessor}
// Workgroup -----------------------------------------------------------------------------------------------------------
var<workgroup> spd_intermediate: array<array<vec4<f32>, 16>, 16>;
var<workgroup> spd_intermediate: array<array<vec4<SPDFloat>, 16>, 16>;
var<workgroup> spd_counter: atomic<u32>;
fn spd_increase_atomic_counter(slice: u32) {
@@ -165,35 +171,35 @@ fn spd_exit_workgroup(num_work_groups: u32, local_invocation_index: u32, slice:
${filterOp}
fn spd_store(pix: vec2<u32>, out_value: vec4<f32>, mip: u32, slice: u32) {
fn spd_store(pix: vec2<u32>, out_value: vec4<SPDFloat>, mip: u32, slice: u32) {
store_dst_mip(out_value, pix, slice, mip + 1);
}
fn spd_load_intermediate(x: u32, y: u32) -> vec4<f32> {
fn spd_load_intermediate(x: u32, y: u32) -> vec4<SPDFloat> {
return spd_intermediate[x][y];
}
fn spd_store_intermediate(x: u32, y: u32, value: vec4<f32>) {
fn spd_store_intermediate(x: u32, y: u32, value: vec4<SPDFloat>) {
spd_intermediate[x][y] = value;
}
fn spd_reduce_intermediate(i0: vec2<u32>, i1: vec2<u32>, i2: vec2<u32>, i3: vec2<u32>) -> vec4<f32> {
fn spd_reduce_intermediate(i0: vec2<u32>, i1: vec2<u32>, i2: vec2<u32>, i3: vec2<u32>) -> vec4<SPDFloat> {
let v0 = spd_load_intermediate(i0.x, i0.y);
let v1 = spd_load_intermediate(i1.x, i1.y);
let v2 = spd_load_intermediate(i2.x, i2.y);
let v3 = spd_load_intermediate(i3.x, i3.y);
return spd_reduce_4(v0, v1, v2, v3);
}
fn spd_reduce_load_4(base: vec2<u32>, slice: u32) -> vec4<f32> {
fn spd_reduce_load_4(base: vec2<u32>, slice: u32) -> vec4<SPDFloat> {
let v0 = load_src_image(base + vec2<u32>(0, 0), slice);
let v1 = load_src_image(base + vec2<u32>(0, 1), slice);
let v2 = load_src_image(base + vec2<u32>(1, 0), slice);
let v3 = load_src_image(base + vec2<u32>(1, 1), slice);
return spd_reduce_4(v0, v1, v2, v3);
}
fn spd_reduce_load_mid_mip_4(base: vec2<u32>, slice: u32) -> vec4<f32> {
fn spd_reduce_load_mid_mip_4(base: vec2<u32>, slice: u32) -> vec4<SPDFloat> {
let v0 = load_mid_mip_image(base + vec2<u32>(0, 0), slice);
let v1 = load_mid_mip_image(base + vec2<u32>(0, 1), slice);
let v2 = load_mid_mip_image(base + vec2<u32>(1, 0), slice);
@@ -204,7 +210,7 @@ fn spd_reduce_load_mid_mip_4(base: vec2<u32>, slice: u32) -> vec4<f32> {
// Main logic ---------------------------------------------------------------------------------------------------------
fn spd_downsample_mips_0_1(x: u32, y: u32, workgroup_id: vec2<u32>, local_invocation_index: u32, mip: u32, slice: u32) {
var v: array<vec4<f32>, 4>;
var v: array<vec4<SPDFloat>, 4>;
let workgroup64 = workgroup_id.xy * 64;
let workgroup32 = workgroup_id.xy * 32;
@@ -479,24 +485,24 @@ fn downsample(@builtin(local_invocation_index) local_invocation_index: u32, @bui
`;
}
const SPD_FILTER_AVERAGE = /* wgsl */ `
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> {
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> {
return (v0 + v1 + v2 + v3) * 0.25;
}
`;
const SPD_FILTER_MIN = /* wgsl */ `
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> {
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> {
return min(min(v0, v1), min(v2, v3));
}
`;
const SPD_FILTER_MAX = /* wgsl */ `
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> {
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> {
return max(max(v0, v1), max(v2, v3));
}
`;
const SPD_FILTER_MINMAX = /* wgsl */ `
fn spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32> {
fn spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat> {
let max4 = max(max(v0.xy, v1.xy), max(v2.xy, v3.xy));
return vec4<f32>(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0);
return vec4<SPDFloat>(min(min(v0.x, v1.x), min(v2.x, v3.x)), max(max4.x, max4.y), 0, 0);
}
`;
const SUPPORTED_FORMATS = new Set([
@@ -586,6 +592,20 @@ export class SPDPass {
return computePassEncoder;
}
}
/**
* Float precision supported by WebGPU SPD.
*/
export var SPDPrecision;
(function (SPDPrecision) {
/**
* Full precision (32-bit) floats.
*/
SPDPrecision["F32"] = "f32";
/**
* Half precision (16-bit) floats.
*/
SPDPrecision["F16"] = "f16";
})(SPDPrecision || (SPDPrecision = {}));
class SPDPipeline {
mipsLayout;
pipelines;
@@ -659,16 +679,29 @@ class DevicePipelines {
});
}
}
sanitizePrecision(precision) {
const device = this.device.deref();
if (!device) {
return precision;
}
else if (precision === SPDPrecision.F16 && !device.features.has('shader-f16')) {
console.warn(`[DevicePipelines::sanitizePrecision]: half precision requested but the device feature 'shader-f16' is not enabled, falling back to full precision`);
return SPDPrecision.F32;
}
else {
return precision;
}
}
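A caller can avoid this warning path entirely by checking the device feature before choosing a precision. A sketch, assuming a `device` and `downsampler` as in the README examples:

```js
// Sketch: pick the precision up front based on device support,
// instead of relying on the runtime fallback in sanitizePrecision.
const precision = device.features.has('shader-f16')
    ? SPDPrecision.F16
    : SPDPrecision.F32;

downsampler.prepareDeviceResources({ device, formats: [
    { format: 'rgba8unorm', precision },
]});
```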
preparePipelines(pipelineConfigs) {
pipelineConfigs?.map(c => {
pipelineConfigs?.forEach(c => {
Array.from(c.filters ?? [SPD_FILTER_AVERAGE]).map(f => {
for (let i = 0; i < this.maxMipsPerPass; ++i) {
this.getOrCreatePipeline(c.format, f, i + 1);
this.getOrCreatePipeline(c.format, f, i + 1, c.precision ?? SPDPrecision.F32);
}
});
});
}
createPipeline(targetFormat, filterCode, numMips) {
createPipeline(targetFormat, filterCode, numMips, precision) {
const device = this.device.deref();
if (!device) {
return undefined;
@@ -696,12 +729,11 @@ class DevicePipelines {
return entry;
})
});
const module = device.createShaderModule({
code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass)),
});
return new SPDPipeline(mipsBindGroupLayout, device.createComputePipeline({
compute: {
module,
module: device.createShaderModule({
code: makeShaderCode(targetFormat, filterCode, Math.min(numMips, this.maxMipsPerPass), precision === SPDPrecision.F16),
}),
entryPoint: 'downsample',
},
layout: device.createPipelineLayout({
@@ -712,20 +744,24 @@ class DevicePipelines {
}),
}));
}
getOrCreatePipeline(targetFormat, filterCode, numMipsToCreate) {
getOrCreatePipeline(targetFormat, filterCode, numMipsToCreate, precision) {
const sanitizedPrecision = this.sanitizePrecision(precision);
if (!this.pipelines.has(targetFormat)) {
this.pipelines.set(targetFormat, new Map());
}
if (!this.pipelines.get(targetFormat)?.has(filterCode)) {
this.pipelines.get(targetFormat)?.set(filterCode, new Map());
if (!this.pipelines.get(targetFormat)?.has(sanitizedPrecision)) {
this.pipelines.get(targetFormat)?.set(sanitizedPrecision, new Map());
}
if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.has(filterCode)) {
this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.set(filterCode, new Map());
}
if (!this.pipelines.get(targetFormat)?.get(filterCode)?.has(numMipsToCreate)) {
const pipelines = this.createPipeline(targetFormat, filterCode, numMipsToCreate);
if (!this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.has(numMipsToCreate)) {
const pipelines = this.createPipeline(targetFormat, filterCode, numMipsToCreate, sanitizedPrecision);
if (pipelines) {
this.pipelines.get(targetFormat)?.get(filterCode)?.set(numMipsToCreate, pipelines);
this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.set(numMipsToCreate, pipelines);
}
}
return this.pipelines.get(targetFormat)?.get(filterCode)?.get(numMipsToCreate);
return this.pipelines.get(targetFormat)?.get(sanitizedPrecision)?.get(filterCode)?.get(numMipsToCreate);
}
getOrCreateMidMipBuffer(device, numArrayLayers) {
if (!this.midMipBuffers.has(numArrayLayers)) {
@@ -783,7 +819,7 @@ class DevicePipelines {
});
}
}
preparePass(texture, target, filterCode, offset, size, numMipsTotal) {
preparePass(texture, target, filterCode, offset, size, numMipsTotal, precision) {
const device = this.device.deref();
if (!device) {
return undefined;
@@ -805,7 +841,7 @@ class DevicePipelines {
numArrayLayers: numArrayLayersThisPass,
});
// todo: handle missing pipeline
const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass);
const pipeline = this.getOrCreatePipeline(target.format, filterCode, numMipsThisPass, precision);
const mipViews = Array(numMipsThisPass + 1).fill(0).map((_, i) => {
if (baseMip === 0 && i === 0) {
return texture.createView({
@@ -923,7 +959,7 @@ export class WebGPUSinglePassDownsampler {
*
* The given WGSL code must (at least) specify a function to reduce four values into one with the following name and signature:
*
* spd_reduce_4(v0: vec4<f32>, v1: vec4<f32>, v2: vec4<f32>, v3: vec4<f32>) -> vec4<f32>
* spd_reduce_4(v0: vec4<SPDFloat>, v1: vec4<SPDFloat>, v2: vec4<SPDFloat>, v3: vec4<SPDFloat>) -> vec4<SPDFloat>
*
* @param name The unique name of the filter operation
* @param wgsl The WGSL code to inject into the downsampling shader as the filter operation
@@ -988,7 +1024,7 @@ export class WebGPUSinglePassDownsampler {
console.warn(`[GPUSinglePassDownsampler::prepare]: filter ${filter} makes no sense for one-component target format ${target.format}`);
}
const filterCode = this.filters.get(filter) ?? SPD_FILTER_AVERAGE;
return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips);
return this.getOrCreateDevicePipelines(device)?.preparePass(texture, target, filterCode, offset, size, numMips, config?.precision ?? SPDPrecision.F32);
}
/**
* Generates mipmaps for the given texture.