From 2efd135fd7247e18a91af5b309156d70cb12650a Mon Sep 17 00:00:00 2001 From: LeshaInc Date: Fri, 14 Jul 2023 00:32:53 +0300 Subject: [PATCH] Reuse buffers for render statistics. Now they work similarly to wgpu-profiler (while collecting more data) --- .../bevy_render/src/renderer/graph_runner.rs | 12 +- crates/bevy_render/src/renderer/mod.rs | 24 +- crates/bevy_render/src/renderer/statistics.rs | 315 ++++++++++++------ 3 files changed, 225 insertions(+), 126 deletions(-) diff --git a/crates/bevy_render/src/renderer/graph_runner.rs b/crates/bevy_render/src/renderer/graph_runner.rs index 790454bc9141d..fe21a4879761b 100644 --- a/crates/bevy_render/src/renderer/graph_runner.rs +++ b/crates/bevy_render/src/renderer/graph_runner.rs @@ -68,14 +68,18 @@ impl RenderGraphRunner { Self::run_graph(graph, None, &mut render_context, world, &[], None)?; finalizer(render_context.command_encoder()); - { + let (render_device, mut statistics_recorder) = { #[cfg(feature = "trace")] let _span = info_span!("submit_graph_commands").entered(); - queue.submit(render_context.finish()); - } + + let (commands, render_device, statistics_recorder) = render_context.finish(); + queue.submit(commands); + + (render_device, statistics_recorder) + }; let render_statistics_mutex = world.resource::().0.clone(); - let statistics_recorder = render_context.download_statistics(queue, move |statistics| { + statistics_recorder.finish_frame(&render_device, move |statistics| { *render_statistics_mutex.lock() = Some(statistics); }); diff --git a/crates/bevy_render/src/renderer/mod.rs b/crates/bevy_render/src/renderer/mod.rs index a2eeed323cd90..ef7781b90f069 100644 --- a/crates/bevy_render/src/renderer/mod.rs +++ b/crates/bevy_render/src/renderer/mod.rs @@ -356,29 +356,23 @@ impl RenderContext { } /// Finalizes the queue and returns the queue of [`CommandBuffer`]s. - pub fn finish(&mut self) -> Vec { + /// + /// Also returns the [`RenderDevice`] and the [`StatisticsRecorder`], so the caller can finish the statistics frame after submitting the commands. 
+ pub fn finish(mut self) -> (Vec, RenderDevice, StatisticsRecorder) { let command_encoder = self.command_encoder.get_or_insert_with(|| { self.render_device .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()) }); - self.statistics_recorder - .resolve(command_encoder, &self.render_device); + self.statistics_recorder.resolve(command_encoder); self.flush_encoder(); - std::mem::take(&mut self.command_buffers) - } - /// Downloads [`RenderStatistics`] from GPU, asynchronously calling the callback - /// when the data is available. Should be caled after `finish`. - pub fn download_statistics( - mut self, - queue: &Queue, - callback: impl FnOnce(RenderStatistics) + Send + 'static, - ) -> StatisticsRecorder { - self.statistics_recorder - .download(&self.render_device, queue, callback); - self.statistics_recorder + ( + self.command_buffers, + self.render_device, + self.statistics_recorder, + ) } fn flush_encoder(&mut self) { diff --git a/crates/bevy_render/src/renderer/statistics.rs b/crates/bevy_render/src/renderer/statistics.rs index a73cf0cfab9d6..269ca22fe6924 100644 --- a/crates/bevy_render/src/renderer/statistics.rs +++ b/crates/bevy_render/src/renderer/statistics.rs @@ -1,20 +1,27 @@ -use std::sync::Arc; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, +}; use bevy_derive::{Deref, DerefMut}; use bevy_ecs::system::{Res, ResMut, Resource}; use bevy_utils::{Duration, HashMap, Instant}; use parking_lot::Mutex; use wgpu::{ - util::DownloadBuffer, Buffer, BufferDescriptor, BufferUsages, CommandEncoder, Features, + Buffer, BufferDescriptor, BufferUsages, CommandEncoder, Features, MapMode, PipelineStatisticsTypes, QuerySet, QuerySetDescriptor, QueryType, Queue, RenderPass, RenderPassDescriptor, }; use super::RenderDevice; +// buffer offset must be divisible by 256, so this constant must be divisible by 32 (=256/8) const MAX_TIMESTAMP_QUERIES: u32 = 256; const MAX_PIPELINE_STATISTICS: u32 = 128; +const TIMESTAMP_SIZE: u64 = 8; +const 
PIPELINE_STATISTICS_SIZE: u64 = 40; + /// Resource which stores statistics for each render pass. #[derive(Debug, Default, Clone, Resource)] pub struct RenderStatistics(pub HashMap); @@ -57,16 +64,14 @@ struct PassRecord { #[derive(Resource)] pub struct StatisticsRecorder { timestamp_period: f32, - timestamps_query_set: Option, - num_timestamps: u32, - pipeline_statistics_query_set: Option, - num_pipeline_statistics: u32, - pass_records: HashMap, - buffer: Option, + features: Features, + current_frame: FrameData, + submitted_frames: Vec, + finished_frames: Vec, } impl StatisticsRecorder { - /// Creates the new `StatisticRecorder` + /// Creates a new `StatisticsRecorder`. pub fn new(device: &RenderDevice, queue: &Queue) -> StatisticsRecorder { let features = device.features(); @@ -76,8 +81,93 @@ impl StatisticsRecorder { 0.0 }; + StatisticsRecorder { + timestamp_period, + features, + current_frame: FrameData::new(device, features), + submitted_frames: Vec::new(), + finished_frames: Vec::new(), + } + } + + /// Begins recording statistics for a new frame. + pub fn begin_frame(&mut self) { + let mut idx = 0; + while idx < self.submitted_frames.len() { + if self.submitted_frames[idx].run_mapped_callback(self.timestamp_period) { + self.finished_frames + .push(self.submitted_frames.swap_remove(idx)); + } else { + idx += 1; + } + } + + dbg!(self.finished_frames.len() + self.submitted_frames.len() + 1); + + self.current_frame.begin(); + } + + fn begin_render_pass(&mut self, pass: &mut RenderPass, name: &str) { + self.current_frame.begin_render_pass(pass, name); + } + + fn end_render_pass(&mut self, pass: &mut RenderPass, name: &str) { + self.current_frame.end_render_pass(pass, name); + } + + /// Copies data from [`QuerySet`]s to a [`Buffer`], after which it can be downloaded to the CPU. 
+ /// + /// Should be called before [`StatisticsRecorder::finish_frame`]. + pub fn resolve(&mut self, encoder: &mut CommandEncoder) { + self.current_frame.resolve(encoder); + } + + /// Finishes recording statistics for the current frame. + /// + /// The specified `callback` will be invoked when statistics become available. + /// + /// Should be called after [`StatisticsRecorder::resolve`], + /// and **after** all command buffers have been submitted to the queue. + pub fn finish_frame( + &mut self, + device: &RenderDevice, + callback: impl FnOnce(RenderStatistics) + Send + Sync + 'static, + ) { + self.current_frame.finish(callback); + + // reuse one of the finished frames, if we can + let new_frame = match self.finished_frames.pop() { + Some(frame) => frame, + None => FrameData::new(device, self.features), + }; + + let old_frame = std::mem::replace(&mut self.current_frame, new_frame); + self.submitted_frames.push(old_frame); + } +} + +struct FrameData { + timestamps_query_set: Option, + num_timestamps: u32, + pipeline_statistics_query_set: Option, + num_pipeline_statistics: u32, + buffer_size: u64, + pipeline_statistics_buffer_offset: u64, + resolve_buffer: Option, + read_buffer: Option, + pass_records: HashMap, + is_mapped: Arc, + callback: Option>, +} + +impl FrameData { + fn new(device: &RenderDevice, features: Features) -> FrameData { + let wgpu_device = device.wgpu_device(); + let mut buffer_size = 0; + let timestamps_query_set = if features.contains(Features::TIMESTAMP_QUERY_INSIDE_PASSES) { - Some(device.wgpu_device().create_query_set(&QuerySetDescriptor { + buffer_size += u64::from(MAX_TIMESTAMP_QUERIES) * TIMESTAMP_SIZE; + Some(wgpu_device.create_query_set(&QuerySetDescriptor { label: Some("timestamps_query_set"), ty: QueryType::Timestamp, count: MAX_TIMESTAMP_QUERIES, @@ -86,9 +176,12 @@ impl StatisticsRecorder { None }; + let pipeline_statistics_buffer_offset = buffer_size; + let pipeline_statistics_query_set = if features.contains(Features::PIPELINE_STATISTICS_QUERY) { 
- Some(device.wgpu_device().create_query_set(&QuerySetDescriptor { + buffer_size += u64::from(MAX_PIPELINE_STATISTICS) * PIPELINE_STATISTICS_SIZE; + Some(wgpu_device.create_query_set(&QuerySetDescriptor { label: Some("pipeline_statistics_query_set"), ty: QueryType::PipelineStatistics(PipelineStatisticsTypes::all()), count: MAX_PIPELINE_STATISTICS, @@ -97,23 +190,43 @@ impl StatisticsRecorder { None }; - StatisticsRecorder { - timestamp_period, + let (resolve_buffer, read_buffer) = if buffer_size > 0 { + let resolve_buffer = wgpu_device.create_buffer(&BufferDescriptor { + label: Some("render_statistics_resolve_buffer"), + size: buffer_size, + usage: BufferUsages::QUERY_RESOLVE | BufferUsages::COPY_SRC, + mapped_at_creation: false, + }); + let read_buffer = wgpu_device.create_buffer(&BufferDescriptor { + label: Some("render_statistics_read_buffer"), + size: buffer_size, + usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + (Some(resolve_buffer), Some(read_buffer)) + } else { + (None, None) + }; + + FrameData { timestamps_query_set, num_timestamps: 0, pipeline_statistics_query_set, num_pipeline_statistics: 0, + buffer_size, + pipeline_statistics_buffer_offset, + resolve_buffer, + read_buffer, pass_records: HashMap::default(), - buffer: None, + is_mapped: Arc::new(AtomicBool::new(false)), + callback: None, } } - /// Begins recording statistics for a new frame. 
- pub fn begin_frame(&mut self) { + fn begin(&mut self) { self.num_timestamps = 0; self.num_pipeline_statistics = 0; self.pass_records.clear(); - self.buffer = None; } fn pass_record(&mut self, name: &str) -> &mut PassRecord { @@ -170,39 +283,15 @@ impl StatisticsRecorder { record.end_instant = Some(Instant::now()); } - fn buffer_size(&self) -> (u64, u64) { - // timestamps are stored as u64 - let mut buffer_size = u64::from(self.num_timestamps) * 8; - if buffer_size % 256 != 0 { - buffer_size = buffer_size + 256 - buffer_size % 256; - } - - let pipeline_statistics_offset = buffer_size; - - // pipeline statistics are stored as [u64; 5] - buffer_size += u64::from(self.num_pipeline_statistics) * 40; - - (buffer_size, pipeline_statistics_offset) - } - - /// Copies data from [`QuerySet`]'s to a buffer, after which it can be downloaded to CPU. - pub fn resolve(&mut self, encoder: &mut CommandEncoder, device: &RenderDevice) { - if self.timestamps_query_set.is_none() && self.pipeline_statistics_query_set.is_none() { + fn resolve(&mut self, encoder: &mut CommandEncoder) { + let (Some(resolve_buffer), Some(read_buffer)) = (&self.resolve_buffer, &self.read_buffer) + else { return; - } - - let (buffer_size, pipeline_statistics_offset) = self.buffer_size(); - - let buffer = device.wgpu_device().create_buffer(&BufferDescriptor { - label: Some("download_statistics_bufer"), - size: buffer_size, - usage: BufferUsages::COPY_SRC | BufferUsages::QUERY_RESOLVE, - mapped_at_creation: false, - }); + }; match &self.timestamps_query_set { Some(set) if self.num_timestamps > 0 => { - encoder.resolve_query_set(set, 0..self.num_timestamps, &buffer, 0); + encoder.resolve_query_set(set, 0..self.num_timestamps, resolve_buffer, 0); } _ => {} } @@ -212,97 +301,109 @@ impl StatisticsRecorder { encoder.resolve_query_set( set, 0..self.num_pipeline_statistics, - &buffer, - pipeline_statistics_offset, + resolve_buffer, + self.pipeline_statistics_buffer_offset, ); } _ => {} } - self.buffer = 
Some(buffer); + encoder.copy_buffer_to_buffer(resolve_buffer, 0, read_buffer, 0, self.buffer_size); } - /// Downloads the statistics from GPU, asynchronously calling the callback when the data is available. - pub fn download( - &mut self, - device: &RenderDevice, - queue: &Queue, - callback: impl FnOnce(RenderStatistics) + Send + 'static, - ) { - let (_, pipeline_statistics_offset) = self.buffer_size(); - let timestamp_period = self.timestamp_period; - let num_timestamps = self.num_timestamps; - let num_pipeline_statistics = self.num_pipeline_statistics; - let pass_records = std::mem::take(&mut self.pass_records); - - let Some(buffer) = &self.buffer else { + fn finish(&mut self, callback: impl FnOnce(RenderStatistics) + Send + Sync + 'static) { + let Some(read_buffer) = &self.read_buffer else { // we still have cpu timings, so let's use them - let statistics = pass_records.into_iter().map(|(name, record)| { + let statistics = self.pass_records.iter().map(|(name, record)| { let mut statistics = RenderPassStatistics::default(); if let (Some(begin), Some(end)) = (record.begin_instant, record.end_instant) { statistics.elapsed_cpu = Some(end - begin); } - (name, statistics) + (name.clone(), statistics) }); callback(RenderStatistics(statistics.collect())); return; }; - DownloadBuffer::read_buffer(device.wgpu_device(), queue, &buffer.slice(..), move |res| { - let buffer = match res { - Ok(v) => v, - Err(e) => { - bevy_log::warn!("Failed to download render statistics buffer: {e}"); - return; - } - }; + self.callback = Some(Box::new(callback)); - let timestamps = buffer[..(num_timestamps * 8) as usize] - .chunks(8) - .map(|v| u64::from_ne_bytes(v.try_into().unwrap())) - .collect::>(); + let is_mapped = self.is_mapped.clone(); + read_buffer.slice(..).map_async(MapMode::Read, move |res| { + if let Err(e) = res { + bevy_log::warn!("Failed to download render statistics buffer: {e}"); + return; + } - let start = pipeline_statistics_offset as usize; - let len = 
(num_pipeline_statistics as usize) * 40; - let pipeline_statistics = buffer[start..start + len] - .chunks(8) - .map(|v| u64::from_ne_bytes(v.try_into().unwrap())) - .collect::>(); + is_mapped.store(true, Ordering::Release); + }); + } - let statistics = pass_records.into_iter().map(|(name, record)| { - let mut statistics = RenderPassStatistics::default(); + // returns true if the frame is considered finished, false otherwise + fn run_mapped_callback(&mut self, timestamp_period: f32) -> bool { + let Some(read_buffer) = &self.read_buffer else { + return true; + }; + if !self.is_mapped.load(Ordering::Acquire) { + // need to wait more + return false; + } + let Some(callback) = self.callback.take() else { + return true; + }; - if let (Some(begin), Some(end)) = (record.begin_instant, record.end_instant) { - statistics.elapsed_cpu = Some(end - begin); - } + let data = read_buffer.slice(..).get_mapped_range(); - if let (Some(begin), Some(end)) = - (record.begin_timestamp_index, record.end_timestamp_index) - { - let begin = timestamps[begin as usize] as f64; - let end = timestamps[end as usize] as f64; - let nanos = ((end - begin) * (timestamp_period as f64)).round() as u64; - statistics.elapsed_gpu = Some(Duration::from_nanos(nanos)); - } + let timestamps = data[..(self.num_timestamps * 8) as usize] + .chunks(8) + .map(|v| u64::from_ne_bytes(v.try_into().unwrap())) + .collect::>(); - if let Some(index) = record.pipeline_statistics_index { - let index = (index as usize) * 5; - statistics.vertex_shader_invocations = Some(pipeline_statistics[index]); - statistics.clipper_invocations = Some(pipeline_statistics[index + 1]); - statistics.clipper_primitives_out = Some(pipeline_statistics[index + 2]); - statistics.fragment_shader_invocations = Some(pipeline_statistics[index + 3]); - statistics.compute_shader_invocations = Some(pipeline_statistics[index + 4]); - } + let start = self.pipeline_statistics_buffer_offset as usize; + let len = (self.num_pipeline_statistics as usize) * 40; 
+ let pipeline_statistics = data[start..start + len] + .chunks(8) + .map(|v| u64::from_ne_bytes(v.try_into().unwrap())) + .collect::>(); - (name, statistics) - }); + let statistics = self.pass_records.iter().map(|(name, record)| { + let mut statistics = RenderPassStatistics::default(); - callback(RenderStatistics(statistics.collect())); + if let (Some(begin), Some(end)) = (record.begin_instant, record.end_instant) { + statistics.elapsed_cpu = Some(end - begin); + } + + if let (Some(begin), Some(end)) = + (record.begin_timestamp_index, record.end_timestamp_index) + { + let begin = timestamps[begin as usize] as f64; + let end = timestamps[end as usize] as f64; + let nanos = ((end - begin) * (timestamp_period as f64)).round() as u64; + statistics.elapsed_gpu = Some(Duration::from_nanos(nanos)); + } + + if let Some(index) = record.pipeline_statistics_index { + let index = (index as usize) * 5; + statistics.vertex_shader_invocations = Some(pipeline_statistics[index]); + statistics.clipper_invocations = Some(pipeline_statistics[index + 1]); + statistics.clipper_primitives_out = Some(pipeline_statistics[index + 2]); + statistics.fragment_shader_invocations = Some(pipeline_statistics[index + 3]); + statistics.compute_shader_invocations = Some(pipeline_statistics[index + 4]); + } + + (name.clone(), statistics) }); + + callback(RenderStatistics(statistics.collect())); + + drop(data); + read_buffer.unmap(); + self.is_mapped.store(false, Ordering::Release); + + true } }