From 3188e5af619577a2b5d8192d9cc4154ebe9e2527 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 10 Dec 2024 09:50:03 -0800 Subject: [PATCH] Batch skinned meshes on platforms where storage buffers are available. (#16599) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes skinned meshes batchable on platforms other than WebGL 2. On supported platforms, it replaces the two uniform buffers used for joint matrices with a pair of storage buffers containing all matrices for all skinned meshes packed together. Each mesh's indices into these buffers are stored in the mesh uniform and the mesh input uniform. When GPU mesh preprocessing is enabled, that step copies the indices from the mesh input uniform into the mesh uniform. On the `many_foxes` demo, I observed a frame time decrease from 15.470ms to 11.935ms. This is the result of reducing the `submit_graph_commands` time from an average of 5.45ms to 0.489ms, an 11x speedup in that portion of rendering. ![Screenshot 2024-12-01 192838](https://github.com/user-attachments/assets/7d2db997-8939-466e-8b9e-050d4a6a78ee) This is what the profile looks like for `many_foxes` after these changes. 
![Screenshot 2024-12-01 193026](https://github.com/user-attachments/assets/68983fc3-01b8-41fd-835e-3d93cb65d0fa) --------- Co-authored-by: François Mockers --- .../bevy_pbr/src/meshlet/instance_manager.rs | 9 +- crates/bevy_pbr/src/prepass/mod.rs | 7 ++ crates/bevy_pbr/src/prepass/prepass.wgsl | 7 +- crates/bevy_pbr/src/render/mesh.rs | 113 +++++++++++++----- crates/bevy_pbr/src/render/mesh.wgsl | 6 +- crates/bevy_pbr/src/render/mesh_bindings.rs | 71 +++++++---- .../bevy_pbr/src/render/mesh_preprocess.wgsl | 6 +- crates/bevy_pbr/src/render/mesh_types.wgsl | 4 +- crates/bevy_pbr/src/render/mod.rs | 2 +- crates/bevy_pbr/src/render/skin.rs | 56 +++++++-- crates/bevy_pbr/src/render/skinning.wgsl | 27 +++++ 11 files changed, 236 insertions(+), 72 deletions(-) diff --git a/crates/bevy_pbr/src/meshlet/instance_manager.rs b/crates/bevy_pbr/src/meshlet/instance_manager.rs index f0c47fddf6072..fcc06c72f5140 100644 --- a/crates/bevy_pbr/src/meshlet/instance_manager.rs +++ b/crates/bevy_pbr/src/meshlet/instance_manager.rs @@ -120,7 +120,14 @@ impl InstanceManager { return; }; - let mesh_uniform = MeshUniform::new(&transforms, 0, mesh_material_binding_id.slot, None); + let mesh_uniform = MeshUniform::new( + &transforms, + 0, + mesh_material_binding_id.slot, + None, + None, + None, + ); // Append instance data self.instances.push(( diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 2af3c7780c720..1c2f1f4ec27a5 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -253,6 +253,11 @@ pub struct PrepassPipeline { pub deferred_material_vertex_shader: Option>, pub deferred_material_fragment_shader: Option>, pub material_pipeline: MaterialPipeline, + + /// Whether skins will use uniform buffers on account of storage buffers + /// being unavailable on this platform. 
+ pub skins_use_uniform_buffers: bool, + pub depth_clip_control_supported: bool, _marker: PhantomData, } @@ -345,6 +350,7 @@ impl FromWorld for PrepassPipeline { }, material_layout: M::bind_group_layout(render_device), material_pipeline: world.resource::>().clone(), + skins_use_uniform_buffers: skin::skins_use_uniform_buffers(render_device), depth_clip_control_supported, _marker: PhantomData, } @@ -521,6 +527,7 @@ where &key.mesh_key, &mut shader_defs, &mut vertex_attributes, + self.skins_use_uniform_buffers, ); bind_group_layouts.insert(1, bind_group); diff --git a/crates/bevy_pbr/src/prepass/prepass.wgsl b/crates/bevy_pbr/src/prepass/prepass.wgsl index 8f7d45c2fd476..26011d609b50c 100644 --- a/crates/bevy_pbr/src/prepass/prepass.wgsl +++ b/crates/bevy_pbr/src/prepass/prepass.wgsl @@ -69,7 +69,11 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput { let mesh_world_from_local = mesh_functions::get_world_from_local(vertex_no_morph.instance_index); #ifdef SKINNED - var world_from_local = skinning::skin_model(vertex.joint_indices, vertex.joint_weights); + var world_from_local = skinning::skin_model( + vertex.joint_indices, + vertex.joint_weights, + vertex_no_morph.instance_index + ); #else // SKINNED // Use vertex_no_morph.instance_index instead of vertex.instance_index to work around a wgpu dx12 bug. 
// See https://github.com/gfx-rs/naga/issues/2416 @@ -142,6 +146,7 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput { let prev_model = skinning::skin_prev_model( prev_vertex.joint_indices, prev_vertex.joint_weights, + vertex_no_morph.instance_index ); #else // HAS_PREVIOUS_SKIN let prev_model = mesh_functions::get_previous_world_from_local(prev_vertex.instance_index); diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 450579fb70970..29f878522e3a3 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -49,6 +49,7 @@ use bevy_utils::{ HashMap, Parallel, }; use material_bind_groups::MaterialBindingId; +use render::skin::{self, SkinIndex}; use crate::{ render::{ @@ -152,7 +153,6 @@ impl Plugin for MeshRenderPlugin { if let Some(render_app) = app.get_sub_app_mut(RenderApp) { render_app .init_resource::() - .init_resource::() .init_resource::() .init_resource::() .init_resource::() @@ -189,7 +189,9 @@ impl Plugin for MeshRenderPlugin { let mut mesh_bindings_shader_defs = Vec::with_capacity(1); if let Some(render_app) = app.get_sub_app_mut(RenderApp) { - render_app.init_resource::(); + render_app + .init_resource::() + .init_resource::(); let gpu_preprocessing_support = render_app.world().resource::(); @@ -220,6 +222,7 @@ impl Plugin for MeshRenderPlugin { collect_meshes_for_gpu_building .in_set(RenderSet::PrepareAssets) .after(allocator::allocate_and_free_meshes) + .after(extract_skins) // This must be before // `set_mesh_motion_vector_flags` so it doesn't // overwrite those flags. @@ -307,12 +310,12 @@ pub struct MeshUniform { /// [`MeshAllocator`]). This value stores the offset of the first vertex in /// this mesh in that buffer. pub first_vertex_index: u32, + /// The current skin index, or `u32::MAX` if there's no skin. + pub current_skin_index: u32, + /// The previous skin index, or `u32::MAX` if there's no previous skin. 
+ pub previous_skin_index: u32, /// Index of the material inside the bind group data. pub material_bind_group_slot: u32, - /// Padding. - pub pad_a: u32, - /// Padding. - pub pad_b: u32, } /// Information that has to be transferred from CPU to GPU in order to produce @@ -349,12 +352,12 @@ pub struct MeshInputUniform { /// [`MeshAllocator`]). This value stores the offset of the first vertex in /// this mesh in that buffer. pub first_vertex_index: u32, + /// The current skin index, or `u32::MAX` if there's no skin. + pub current_skin_index: u32, + /// The previous skin index, or `u32::MAX` if there's no previous skin. + pub previous_skin_index: u32, /// Index of the material inside the bind group data. pub material_bind_group_slot: u32, - /// Padding. - pub pad_a: u32, - /// Padding. - pub pad_b: u32, } /// Information about each mesh instance needed to cull it on GPU. @@ -386,6 +389,8 @@ impl MeshUniform { first_vertex_index: u32, material_bind_group_slot: MaterialBindGroupSlot, maybe_lightmap_uv_rect: Option, + current_skin_index: Option, + previous_skin_index: Option, ) -> Self { let (local_from_world_transpose_a, local_from_world_transpose_b) = mesh_transforms.world_from_local.inverse_transpose_3x3(); @@ -397,9 +402,9 @@ impl MeshUniform { local_from_world_transpose_b, flags: mesh_transforms.flags, first_vertex_index, + current_skin_index: current_skin_index.unwrap_or(u32::MAX), + previous_skin_index: previous_skin_index.unwrap_or(u32::MAX), material_bind_group_slot: *material_bind_group_slot, - pad_a: 0, - pad_b: 0, } } } @@ -880,6 +885,7 @@ impl RenderMeshInstanceGpuBuilder { current_input_buffer: &mut InstanceInputUniformBuffer, previous_input_buffer: &mut InstanceInputUniformBuffer, mesh_allocator: &MeshAllocator, + skin_indices: &SkinIndices, ) -> u32 { let first_vertex_index = match mesh_allocator.mesh_vertex_slice(&self.shared.mesh_asset_id) { @@ -887,6 +893,15 @@ impl RenderMeshInstanceGpuBuilder { None => 0, }; + let current_skin_index = match 
skin_indices.current.get(&entity) { + Some(skin_indices) => skin_indices.index(), + None => u32::MAX, + }; + let previous_skin_index = match skin_indices.prev.get(&entity) { + Some(skin_indices) => skin_indices.index(), + None => u32::MAX, + }; + // Create the mesh input uniform. let mut mesh_input_uniform = MeshInputUniform { world_from_local: self.world_from_local.to_transpose(), @@ -894,9 +909,9 @@ impl RenderMeshInstanceGpuBuilder { flags: self.mesh_flags.bits(), previous_input_index: u32::MAX, first_vertex_index, + current_skin_index, + previous_skin_index, material_bind_group_slot: *self.shared.material_bindings_index.slot, - pad_a: 0, - pad_b: 0, }; // Did the last frame contain this entity as well? @@ -1312,6 +1327,7 @@ pub fn collect_meshes_for_gpu_building( mut mesh_culling_data_buffer: ResMut, mut render_mesh_instance_queues: ResMut, mesh_allocator: Res, + skin_indices: Res, ) { let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = render_mesh_instances.into_inner() @@ -1347,6 +1363,7 @@ pub fn collect_meshes_for_gpu_building( current_input_buffer, previous_input_buffer, &mesh_allocator, + &skin_indices, ); } @@ -1370,6 +1387,7 @@ pub fn collect_meshes_for_gpu_building( current_input_buffer, previous_input_buffer, &mesh_allocator, + &skin_indices, ); mesh_culling_builder .update(&mut mesh_culling_data_buffer, instance_data_index as usize); @@ -1417,6 +1435,10 @@ pub struct MeshPipeline { /// /// This affects whether reflection probes can be used. pub binding_arrays_are_usable: bool, + + /// Whether skins will use uniform buffers on account of storage buffers + /// being unavailable on this platform. 
+ pub skins_use_uniform_buffers: bool, } impl FromWorld for MeshPipeline { @@ -1474,6 +1496,7 @@ impl FromWorld for MeshPipeline { mesh_layouts: MeshLayouts::new(&render_device), per_object_buffer_batch_size: GpuArrayBuffer::::batch_size(&render_device), binding_arrays_are_usable: binding_arrays_are_usable(&render_device), + skins_use_uniform_buffers: skin::skins_use_uniform_buffers(&render_device), } } } @@ -1506,6 +1529,7 @@ impl GetBatchData for MeshPipeline { SRes, SRes>, SRes, + SRes, ); // The material bind group ID, the mesh ID, and the lightmap ID, // respectively. @@ -1518,7 +1542,7 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; fn get_batch_data( - (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem, + (mesh_instances, lightmaps, _, mesh_allocator, skin_indices): &SystemParamItem, (_entity, main_entity): (Entity, MainEntity), ) -> Option<(Self::BufferData, Option)> { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1536,6 +1560,9 @@ impl GetBatchData for MeshPipeline { }; let maybe_lightmap = lightmaps.render_lightmaps.get(&main_entity); + let current_skin_index = skin_indices.current.get(&main_entity).map(SkinIndex::index); + let previous_skin_index = skin_indices.prev.get(&main_entity).map(SkinIndex::index); + let material_bind_group_index = mesh_instance.material_bindings_index; Some(( @@ -1544,6 +1571,8 @@ impl GetBatchData for MeshPipeline { first_vertex_index, material_bind_group_index.slot, maybe_lightmap.map(|lightmap| lightmap.uv_rect), + current_skin_index, + previous_skin_index, ), mesh_instance.should_batch().then_some(( material_bind_group_index.group, @@ -1558,7 +1587,7 @@ impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; fn get_index_and_compare_data( - (mesh_instances, lightmaps, _, _): &SystemParamItem, + (mesh_instances, lightmaps, _, _, _): &SystemParamItem, (_entity, main_entity): (Entity, MainEntity), ) -> Option<(NonMaxU32, 
Option)> { // This should only be called during GPU building. @@ -1584,7 +1613,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_batch_data( - (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem, + (mesh_instances, lightmaps, _, mesh_allocator, skin_indices): &SystemParamItem, (_entity, main_entity): (Entity, MainEntity), ) -> Option { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1601,16 +1630,21 @@ impl GetFullBatchData for MeshPipeline { }; let maybe_lightmap = lightmaps.render_lightmaps.get(&main_entity); + let current_skin_index = skin_indices.current.get(&main_entity).map(SkinIndex::index); + let previous_skin_index = skin_indices.prev.get(&main_entity).map(SkinIndex::index); + Some(MeshUniform::new( &mesh_instance.transforms, first_vertex_index, mesh_instance.material_bindings_index.slot, maybe_lightmap.map(|lightmap| lightmap.uv_rect), + current_skin_index, + previous_skin_index, )) } fn get_binned_index( - (mesh_instances, _, _, _): &SystemParamItem, + (mesh_instances, _, _, _, _): &SystemParamItem, (_entity, main_entity): (Entity, MainEntity), ) -> Option { // This should only be called during GPU building. 
@@ -1628,7 +1662,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_batch_indirect_parameters_index( - (mesh_instances, _, meshes, mesh_allocator): &SystemParamItem, + (mesh_instances, _, meshes, mesh_allocator, _): &SystemParamItem, indirect_parameters_buffer: &mut IndirectParametersBuffer, entity: (Entity, MainEntity), instance_index: u32, @@ -1868,15 +1902,22 @@ pub fn setup_morph_and_skinning_defs( key: &MeshPipelineKey, shader_defs: &mut Vec, vertex_attributes: &mut Vec, + skins_use_uniform_buffers: bool, ) -> BindGroupLayout { + let is_morphed = key.intersects(MeshPipelineKey::MORPH_TARGETS); + let is_lightmapped = key.intersects(MeshPipelineKey::LIGHTMAPPED); + let motion_vector_prepass = key.intersects(MeshPipelineKey::MOTION_VECTOR_PREPASS); + + if skins_use_uniform_buffers { + shader_defs.push("SKINS_USE_UNIFORM_BUFFERS".into()); + } + let mut add_skin_data = || { shader_defs.push("SKINNED".into()); vertex_attributes.push(Mesh::ATTRIBUTE_JOINT_INDEX.at_shader_location(offset)); vertex_attributes.push(Mesh::ATTRIBUTE_JOINT_WEIGHT.at_shader_location(offset + 1)); }; - let is_morphed = key.intersects(MeshPipelineKey::MORPH_TARGETS); - let is_lightmapped = key.intersects(MeshPipelineKey::LIGHTMAPPED); - let motion_vector_prepass = key.intersects(MeshPipelineKey::MOTION_VECTOR_PREPASS); + match ( is_skinned(layout), is_morphed, @@ -1985,6 +2026,7 @@ impl SpecializedMeshPipeline for MeshPipeline { &key, &mut shader_defs, &mut vertex_attributes, + self.skins_use_uniform_buffers, )); if key.contains(MeshPipelineKey::SCREEN_SPACE_AMBIENT_OCCLUSION) { @@ -2477,6 +2519,7 @@ impl RenderCommand

for SetMeshViewBindGroup pub struct SetMeshBindGroup; impl RenderCommand

for SetMeshBindGroup { type Param = ( + SRes, SRes, SRes, SRes, @@ -2491,11 +2534,14 @@ impl RenderCommand

for SetMeshBindGroup { item: &P, has_motion_vector_prepass: bool, _item_query: Option<()>, - (bind_groups, mesh_instances, skin_indices, morph_indices, lightmaps): SystemParamItem< - 'w, - '_, - Self::Param, - >, + ( + render_device, + bind_groups, + mesh_instances, + skin_indices, + morph_indices, + lightmaps, + ): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { let bind_groups = bind_groups.into_inner(); @@ -2508,6 +2554,7 @@ impl RenderCommand

for SetMeshBindGroup { let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(*entity) else { return RenderCommandResult::Success; }; + let current_skin_index = skin_indices.current.get(entity); let prev_skin_index = skin_indices.prev.get(entity); let current_morph_index = morph_indices.current.get(entity); @@ -2542,8 +2589,10 @@ impl RenderCommand

for SetMeshBindGroup { offset_count += 1; } if let Some(current_skin_index) = current_skin_index { - dynamic_offsets[offset_count] = current_skin_index.index; - offset_count += 1; + if skin::skins_use_uniform_buffers(&render_device) { + dynamic_offsets[offset_count] = current_skin_index.byte_offset; + offset_count += 1; + } } if let Some(current_morph_index) = current_morph_index { dynamic_offsets[offset_count] = current_morph_index.index; @@ -2554,9 +2603,11 @@ impl RenderCommand

for SetMeshBindGroup { if has_motion_vector_prepass { // Attach the previous skin index for motion vector computation. If // there isn't one, just use zero as the shader will ignore it. - if current_skin_index.is_some() { + if current_skin_index.is_some() && skin::skins_use_uniform_buffers(&render_device) { match prev_skin_index { - Some(prev_skin_index) => dynamic_offsets[offset_count] = prev_skin_index.index, + Some(prev_skin_index) => { + dynamic_offsets[offset_count] = prev_skin_index.byte_offset; + } None => dynamic_offsets[offset_count] = 0, } offset_count += 1; diff --git a/crates/bevy_pbr/src/render/mesh.wgsl b/crates/bevy_pbr/src/render/mesh.wgsl index 3971a53902ef8..95684684f5140 100644 --- a/crates/bevy_pbr/src/render/mesh.wgsl +++ b/crates/bevy_pbr/src/render/mesh.wgsl @@ -44,7 +44,11 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput { let mesh_world_from_local = mesh_functions::get_world_from_local(vertex_no_morph.instance_index); #ifdef SKINNED - var world_from_local = skinning::skin_model(vertex.joint_indices, vertex.joint_weights); + var world_from_local = skinning::skin_model( + vertex.joint_indices, + vertex.joint_weights, + vertex_no_morph.instance_index + ); #else // Use vertex_no_morph.instance_index instead of vertex.instance_index to work around a wgpu dx12 bug. // See https://github.com/gfx-rs/naga/issues/2416 . diff --git a/crates/bevy_pbr/src/render/mesh_bindings.rs b/crates/bevy_pbr/src/render/mesh_bindings.rs index b458b3f98e60e..cc1c5bec23bb7 100644 --- a/crates/bevy_pbr/src/render/mesh_bindings.rs +++ b/crates/bevy_pbr/src/render/mesh_bindings.rs @@ -22,10 +22,13 @@ pub(crate) const JOINT_BUFFER_SIZE: usize = MAX_JOINTS * JOINT_SIZE; /// Individual layout entries. 
mod layout_entry { use super::{JOINT_BUFFER_SIZE, MORPH_BUFFER_SIZE}; - use crate::MeshUniform; + use crate::{render::skin, MeshUniform}; use bevy_render::{ render_resource::{ - binding_types::{sampler, texture_2d, texture_3d, uniform_buffer_sized}, + binding_types::{ + sampler, storage_buffer_read_only_sized, texture_2d, texture_3d, + uniform_buffer_sized, + }, BindGroupLayoutEntryBuilder, BufferSize, GpuArrayBuffer, SamplerBindingType, ShaderStages, TextureSampleType, }, @@ -36,8 +39,15 @@ mod layout_entry { GpuArrayBuffer::::binding_layout(render_device) .visibility(ShaderStages::VERTEX_FRAGMENT) } - pub(super) fn skinning() -> BindGroupLayoutEntryBuilder { - uniform_buffer_sized(true, BufferSize::new(JOINT_BUFFER_SIZE as u64)) + pub(super) fn skinning(render_device: &RenderDevice) -> BindGroupLayoutEntryBuilder { + // If we can use storage buffers, do so. Otherwise, fall back to uniform + // buffers. + let size = BufferSize::new(JOINT_BUFFER_SIZE as u64); + if skin::skins_use_uniform_buffers(render_device) { + uniform_buffer_sized(true, size) + } else { + storage_buffer_read_only_sized(false, size) + } } pub(super) fn weights() -> BindGroupLayoutEntryBuilder { uniform_buffer_sized(true, BufferSize::new(MORPH_BUFFER_SIZE as u64)) @@ -56,29 +66,44 @@ mod layout_entry { /// Individual [`BindGroupEntry`] /// for bind groups. 
mod entry { + use crate::render::skin; + use super::{JOINT_BUFFER_SIZE, MORPH_BUFFER_SIZE}; - use bevy_render::render_resource::{ - BindGroupEntry, BindingResource, Buffer, BufferBinding, BufferSize, Sampler, TextureView, + use bevy_render::{ + render_resource::{ + BindGroupEntry, BindingResource, Buffer, BufferBinding, BufferSize, Sampler, + TextureView, + }, + renderer::RenderDevice, }; - fn entry(binding: u32, size: u64, buffer: &Buffer) -> BindGroupEntry { + fn entry(binding: u32, size: Option, buffer: &Buffer) -> BindGroupEntry { BindGroupEntry { binding, resource: BindingResource::Buffer(BufferBinding { buffer, offset: 0, - size: Some(BufferSize::new(size).unwrap()), + size: size.map(|size| BufferSize::new(size).unwrap()), }), } } pub(super) fn model(binding: u32, resource: BindingResource) -> BindGroupEntry { BindGroupEntry { binding, resource } } - pub(super) fn skinning(binding: u32, buffer: &Buffer) -> BindGroupEntry { - entry(binding, JOINT_BUFFER_SIZE as u64, buffer) + pub(super) fn skinning<'a>( + render_device: &RenderDevice, + binding: u32, + buffer: &'a Buffer, + ) -> BindGroupEntry<'a> { + let size = if skin::skins_use_uniform_buffers(render_device) { + Some(JOINT_BUFFER_SIZE as u64) + } else { + None + }; + entry(binding, size, buffer) } pub(super) fn weights(binding: u32, buffer: &Buffer) -> BindGroupEntry { - entry(binding, MORPH_BUFFER_SIZE as u64, buffer) + entry(binding, Some(MORPH_BUFFER_SIZE as u64), buffer) } pub(super) fn targets(binding: u32, texture: &TextureView) -> BindGroupEntry { BindGroupEntry { @@ -175,7 +200,7 @@ impl MeshLayouts { ( (0, layout_entry::model(render_device)), // The current frame's joint matrix buffer. - (1, layout_entry::skinning()), + (1, layout_entry::skinning(render_device)), ), ), ) @@ -191,9 +216,9 @@ impl MeshLayouts { ( (0, layout_entry::model(render_device)), // The current frame's joint matrix buffer. 
- (1, layout_entry::skinning()), + (1, layout_entry::skinning(render_device)), // The previous frame's joint matrix buffer. - (6, layout_entry::skinning()), + (6, layout_entry::skinning(render_device)), ), ), ) @@ -244,7 +269,7 @@ impl MeshLayouts { ( (0, layout_entry::model(render_device)), // The current frame's joint matrix buffer. - (1, layout_entry::skinning()), + (1, layout_entry::skinning(render_device)), // The current frame's morph weight buffer. (2, layout_entry::weights()), (3, layout_entry::targets()), @@ -263,12 +288,12 @@ impl MeshLayouts { ( (0, layout_entry::model(render_device)), // The current frame's joint matrix buffer. - (1, layout_entry::skinning()), + (1, layout_entry::skinning(render_device)), // The current frame's morph weight buffer. (2, layout_entry::weights()), (3, layout_entry::targets()), // The previous frame's joint matrix buffer. - (6, layout_entry::skinning()), + (6, layout_entry::skinning(render_device)), // The previous frame's morph weight buffer. (7, layout_entry::weights()), ), @@ -329,7 +354,7 @@ impl MeshLayouts { &self.skinned, &[ entry::model(0, model.clone()), - entry::skinning(1, current_skin), + entry::skinning(render_device, 1, current_skin), ], ) } @@ -353,8 +378,8 @@ impl MeshLayouts { &self.skinned_motion, &[ entry::model(0, model.clone()), - entry::skinning(1, current_skin), - entry::skinning(6, prev_skin), + entry::skinning(render_device, 1, current_skin), + entry::skinning(render_device, 6, prev_skin), ], ) } @@ -420,7 +445,7 @@ impl MeshLayouts { &self.morphed_skinned, &[ entry::model(0, model.clone()), - entry::skinning(1, current_skin), + entry::skinning(render_device, 1, current_skin), entry::weights(2, current_weights), entry::targets(3, targets), ], @@ -450,10 +475,10 @@ impl MeshLayouts { &self.morphed_skinned_motion, &[ entry::model(0, model.clone()), - entry::skinning(1, current_skin), + entry::skinning(render_device, 1, current_skin), entry::weights(2, current_weights), entry::targets(3, targets), - 
entry::skinning(6, prev_skin), + entry::skinning(render_device, 6, prev_skin), entry::weights(7, prev_weights), ], ) diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index 5132691930cad..3300c4d925d6b 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -23,10 +23,10 @@ struct MeshInput { // applicable. If not present, this is `u32::MAX`. previous_input_index: u32, first_vertex_index: u32, + current_skin_index: u32, + previous_skin_index: u32, // Index of the material inside the bind group data. material_bind_group_slot: u32, - pad_a: u32, - pad_b: u32, } // Information about each mesh instance needed to cull it on GPU. @@ -192,6 +192,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { output[mesh_output_index].flags = current_input[input_index].flags; output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect; output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index; + output[mesh_output_index].current_skin_index = current_input[input_index].current_skin_index; + output[mesh_output_index].previous_skin_index = current_input[input_index].previous_skin_index; output[mesh_output_index].material_bind_group_slot = current_input[input_index].material_bind_group_slot; } diff --git a/crates/bevy_pbr/src/render/mesh_types.wgsl b/crates/bevy_pbr/src/render/mesh_types.wgsl index 7cf8cdf7ed512..f94074d813465 100644 --- a/crates/bevy_pbr/src/render/mesh_types.wgsl +++ b/crates/bevy_pbr/src/render/mesh_types.wgsl @@ -17,10 +17,10 @@ struct Mesh { lightmap_uv_rect: vec2, // The index of the mesh's first vertex in the vertex buffer. first_vertex_index: u32, + current_skin_index: u32, + previous_skin_index: u32, // Index of the material inside the bind group data. 
material_bind_group_slot: u32, - pad_a: u32, - pad_b: u32, }; #ifdef SKINNED diff --git a/crates/bevy_pbr/src/render/mod.rs b/crates/bevy_pbr/src/render/mod.rs index 2a69e28bf3a44..8e26e869a1c96 100644 --- a/crates/bevy_pbr/src/render/mod.rs +++ b/crates/bevy_pbr/src/render/mod.rs @@ -5,7 +5,7 @@ pub(crate) mod mesh; mod mesh_bindings; mod mesh_view_bindings; mod morph; -mod skin; +pub(crate) mod skin; pub use fog::*; pub use gpu_preprocess::*; diff --git a/crates/bevy_pbr/src/render/skin.rs b/crates/bevy_pbr/src/render/skin.rs index 1f1ba39f8b294..c248821ccafd3 100644 --- a/crates/bevy_pbr/src/render/skin.rs +++ b/crates/bevy_pbr/src/render/skin.rs @@ -1,4 +1,5 @@ use core::mem::{self, size_of}; +use std::sync::OnceLock; use bevy_asset::Assets; use bevy_ecs::prelude::*; @@ -23,18 +24,27 @@ use bevy_transform::prelude::GlobalTransform; /// of the GPU at runtime, which would mean not using consts anymore. pub const MAX_JOINTS: usize = 256; +/// The location of the first joint matrix in the skin uniform buffer. #[derive(Component)] pub struct SkinIndex { - pub index: u32, + /// The byte offset of the first joint matrix. + pub byte_offset: u32, } impl SkinIndex { /// Index to be in address space based on the size of a skin uniform. const fn new(start: usize) -> Self { SkinIndex { - index: (start * size_of::()) as u32, + byte_offset: (start * size_of::()) as u32, } } + + /// Returns this skin index in elements (not bytes). + /// + /// Each element is a 4x4 matrix. 
+ pub fn index(&self) -> u32 { + self.byte_offset / size_of::() as u32 + } } /// Maps each skinned mesh to the applicable offset within the [`SkinUniforms`] @@ -70,15 +80,30 @@ pub struct SkinUniforms { pub prev_buffer: RawBufferVec, } -impl Default for SkinUniforms { - fn default() -> Self { +impl FromWorld for SkinUniforms { + fn from_world(world: &mut World) -> Self { + let device = world.resource::(); + let buffer_usages = if skins_use_uniform_buffers(device) { + BufferUsages::UNIFORM + } else { + BufferUsages::STORAGE + }; + Self { - current_buffer: RawBufferVec::new(BufferUsages::UNIFORM), - prev_buffer: RawBufferVec::new(BufferUsages::UNIFORM), + current_buffer: RawBufferVec::new(buffer_usages), + prev_buffer: RawBufferVec::new(buffer_usages), } } } +/// Returns true if skinning must use uniforms (and dynamic offsets) because +/// storage buffers aren't supported on the current platform. +pub fn skins_use_uniform_buffers(render_device: &RenderDevice) -> bool { + static SKINS_USE_UNIFORM_BUFFERS: OnceLock = OnceLock::new(); + *SKINS_USE_UNIFORM_BUFFERS + .get_or_init(|| render_device.limits().max_storage_buffers_per_shader_stage == 0) +} + pub fn prepare_skins( render_device: Res, render_queue: Res, @@ -130,7 +155,10 @@ pub fn extract_skins( query: Extract>, inverse_bindposes: Extract>>, joints: Extract>, + render_device: Res, ) { + let skins_use_uniform_buffers = skins_use_uniform_buffers(&render_device); + // Borrow check workaround. let (skin_indices, uniform) = (skin_indices.into_inner(), uniform.into_inner()); @@ -170,9 +198,12 @@ pub fn extract_skins( } last_start = last_start.max(start); - // Pad to 256 byte alignment - while buffer.len() % 4 != 0 { - buffer.push(Mat4::ZERO); + // Pad to 256 byte alignment if we're using a uniform buffer. + // There's no need to do this if we're using storage buffers, though. 
+ if skins_use_uniform_buffers { + while buffer.len() % 4 != 0 { + buffer.push(Mat4::ZERO); + } } skin_indices @@ -187,11 +218,16 @@ pub fn extract_skins( } // NOTE: The skinned joints uniform buffer has to be bound at a dynamic offset per -// entity and so cannot currently be batched. +// entity and so cannot currently be batched on WebGL 2. pub fn no_automatic_skin_batching( mut commands: Commands, query: Query, Without)>, + render_device: Res, ) { + if !skins_use_uniform_buffers(&render_device) { + return; + } + for entity in &query { commands.entity(entity).try_insert(NoAutomaticBatching); } diff --git a/crates/bevy_pbr/src/render/skinning.wgsl b/crates/bevy_pbr/src/render/skinning.wgsl index 1ed9393308995..92e977aeb1b92 100644 --- a/crates/bevy_pbr/src/render/skinning.wgsl +++ b/crates/bevy_pbr/src/render/skinning.wgsl @@ -1,10 +1,15 @@ #define_import_path bevy_pbr::skinning #import bevy_pbr::mesh_types::SkinnedMesh +#import bevy_pbr::mesh_bindings::mesh #ifdef SKINNED +#ifdef SKINS_USE_UNIFORM_BUFFERS @group(1) @binding(1) var joint_matrices: SkinnedMesh; +#else // SKINS_USE_UNIFORM_BUFFERS +@group(1) @binding(1) var joint_matrices: array>; +#endif // SKINS_USE_UNIFORM_BUFFERS // An array of matrices specifying the joint positions from the previous frame. // @@ -12,16 +17,29 @@ // // If this is the first frame, or we're otherwise prevented from using data from // the previous frame, this is simply the same as `joint_matrices` above. 
+#ifdef SKINS_USE_UNIFORM_BUFFERS @group(1) @binding(6) var prev_joint_matrices: SkinnedMesh; +#else // SKINS_USE_UNIFORM_BUFFERS +@group(1) @binding(6) var prev_joint_matrices: array>; +#endif // SKINS_USE_UNIFORM_BUFFERS fn skin_model( indexes: vec4, weights: vec4, + instance_index: u32, ) -> mat4x4 { +#ifdef SKINS_USE_UNIFORM_BUFFERS return weights.x * joint_matrices.data[indexes.x] + weights.y * joint_matrices.data[indexes.y] + weights.z * joint_matrices.data[indexes.z] + weights.w * joint_matrices.data[indexes.w]; +#else // SKINS_USE_UNIFORM_BUFFERS + let skin_index = mesh[instance_index].current_skin_index; + return weights.x * joint_matrices[skin_index + indexes.x] + + weights.y * joint_matrices[skin_index + indexes.y] + + weights.z * joint_matrices[skin_index + indexes.z] + + weights.w * joint_matrices[skin_index + indexes.w]; +#endif // SKINS_USE_UNIFORM_BUFFERS } // Returns the skinned position of a vertex with the given weights from the @@ -31,11 +49,20 @@ fn skin_model( fn skin_prev_model( indexes: vec4, weights: vec4, + instance_index: u32, ) -> mat4x4 { +#ifdef SKINS_USE_UNIFORM_BUFFERS return weights.x * prev_joint_matrices.data[indexes.x] + weights.y * prev_joint_matrices.data[indexes.y] + weights.z * prev_joint_matrices.data[indexes.z] + weights.w * prev_joint_matrices.data[indexes.w]; +#else // SKINS_USE_UNIFORM_BUFFERS + let skin_index = mesh[instance_index].previous_skin_index; + return weights.x * prev_joint_matrices[skin_index + indexes.x] + + weights.y * prev_joint_matrices[skin_index + indexes.y] + + weights.z * prev_joint_matrices[skin_index + indexes.z] + + weights.w * prev_joint_matrices[skin_index + indexes.w]; +#endif // SKINS_USE_UNIFORM_BUFFERS } fn inverse_transpose_3x3m(in: mat3x3) -> mat3x3 {