Skip to content

Commit

Permalink
Embed strides into shader
Browse files Browse the repository at this point in the history
Embed the element strides of various data structures directly into the
`vfx_indirect` shader, instead of passing them at runtime through the
`SimParams` uniform buffer. Those strides are constant per GPU device, so they
cannot change after startup.
  • Loading branch information
djeedai committed May 19, 2024
1 parent a80db80 commit ffda9de
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 51 deletions.
49 changes: 12 additions & 37 deletions src/render/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ use crate::{
},
spawn::EffectSpawner,
CompiledParticleEffect, EffectProperties, EffectShader, EffectSimulation, HanabiPlugin,
ParticleLayout, PropertyLayout, RemovedEffectsEvent, SimulationCondition,
ParticleLayout, PropertyLayout, RemovedEffectsEvent, SimulationCondition, ToWgslString,
};

mod aligned_buffer_vec;
Expand Down Expand Up @@ -119,17 +119,6 @@ struct GpuSimParams {
///
/// This is only used by the `vfx_indirect` compute shader.
num_groups: u32,
/// Stride in bytes of an effect render indirect block, used to index the
/// effect's block based on its index.
render_effect_stride: u32,
/// Stride in bytes of a group render indirect block, used to index the
/// effect's block based on its index.
render_group_stride: u32,
/// Stride in bytes of a dispatch indirect block, used to index the effect's
/// block based on its index.
///
/// This is only used by the `vfx_indirect` compute shader.
dispatch_stride: u32,
}

impl Default for GpuSimParams {
Expand All @@ -142,9 +131,6 @@ impl Default for GpuSimParams {
real_delta_time: 0.04,
real_time: 0.0,
num_groups: 0,
render_effect_stride: 0, // invalid
render_group_stride: 0, // invalid
dispatch_stride: 0, // invalid
}
}
}
Expand Down Expand Up @@ -444,7 +430,15 @@ impl FromWorld for DispatchIndirectPipeline {
push_constant_ranges: &[],
});

let indirect_code = include_str!("vfx_indirect.wgsl");


let render_effect_indirect_stride_code = (render_effect_indirect_size.get() as u32).to_wgsl_string();
let render_group_indirect_stride_code = (render_group_indirect_size.get() as u32).to_wgsl_string();
let dispatch_indirect_stride_code = (dispatch_indirect_size.get() as u32).to_wgsl_string();
let indirect_code = include_str!("vfx_indirect.wgsl")
.replace("{{RENDER_EFFECT_INDIRECT_STRIDE}}", &render_effect_indirect_stride_code)
.replace("{{RENDER_GROUP_INDIRECT_STRIDE}}", &render_group_indirect_stride_code)
.replace("{{DISPATCH_INDIRECT_STRIDE}}", &dispatch_indirect_stride_code);

// Resolve imports. Because we don't insert this shader into Bevy' pipeline
// cache, we don't get that part "for free", so we have to do it manually here.
Expand All @@ -469,7 +463,7 @@ impl FromWorld for DispatchIndirectPipeline {
let shader_defs = default();

match composer.make_naga_module(NagaModuleDescriptor {
source: indirect_code,
source: &indirect_code,
file_path: "vfx_indirect.wgsl",
shader_defs,
..Default::default()
Expand Down Expand Up @@ -2283,41 +2277,22 @@ pub(crate) fn prepare_effects(
.sim_params_uniforms
.set(GpuSimParams::default());
{
let storage_align = effects_meta.gpu_limits.storage_buffer_align().get() as usize;
let render_effect_stride =
effects_meta.gpu_limits.render_effect_indirect_size().get() as u32;
let render_group_stride = effects_meta.gpu_limits.render_group_indirect_size().get() as u32;

let gpu_sim_params = effects_meta.sim_params_uniforms.get_mut();
let sim_params = *sim_params;
*gpu_sim_params = sim_params.into();

gpu_sim_params.num_groups = total_group_count;

// FIXME - Those are shader compile time constants, which only change with the
// GPU adapter limits (so, fixed while the app runs). Stop wasting uniform
// storage and hardcode into shader instead.
gpu_sim_params.render_effect_stride = render_effect_stride;
gpu_sim_params.render_group_stride = render_group_stride;
gpu_sim_params.dispatch_stride = next_multiple_of(
GpuDispatchIndirect::min_size().get() as usize,
storage_align,
) as u32;

trace!(
"Simulation parameters: time={} delta_time={} virtual_time={} \
virtual_delta_time={} real_time={} real_delta_time={} num_groups={} \
render_effect_stride={} render_group_stride={} dispatch_stride={}",
virtual_delta_time={} real_time={} real_delta_time={} num_groups={}",
gpu_sim_params.time,
gpu_sim_params.delta_time,
gpu_sim_params.virtual_time,
gpu_sim_params.virtual_delta_time,
gpu_sim_params.real_time,
gpu_sim_params.real_delta_time,
gpu_sim_params.num_groups,
gpu_sim_params.render_effect_stride,
gpu_sim_params.render_group_stride,
gpu_sim_params.dispatch_stride,
);
}
// FIXME - There's no simple way to tell if write_buffer() reallocates...
Expand Down
11 changes: 0 additions & 11 deletions src/render/vfx_common.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,8 @@ struct SimParams {
real_delta_time: f32,
/// Real time in seconds since the start of simulation.
real_time: f32,
//#ifdef SIM_PARAMS_INDIRECT_DATA
/// Number of groups batched together.
num_groups: u32,
/// Stride in bytes of the RenderEffectMetadata struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
render_effect_stride: u32,
/// Stride in bytes of the RenderGroupIndirect struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
render_group_stride: u32,
/// Stride in bytes of the DispatchIndirect struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
dispatch_stride: u32,
//#endif
}

struct Spawner {
Expand Down
10 changes: 7 additions & 3 deletions src/render/vfx_indirect.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
REM_OFFSET_MAX_SPAWN, RGI_OFFSET_INSTANCE_COUNT, REM_OFFSET_PING
}

const RENDER_EFFECT_INDIRECT_STRIDE: u32 = {{RENDER_EFFECT_INDIRECT_STRIDE}} / 4u;
const RENDER_GROUP_INDIRECT_STRIDE: u32 = {{RENDER_GROUP_INDIRECT_STRIDE}} / 4u;
const DISPATCH_INDIRECT_STRIDE: u32 = {{DISPATCH_INDIRECT_STRIDE}} / 4u;

@group(0) @binding(0) var<storage, read_write> render_effect_indirect_buffer : array<u32>;
@group(0) @binding(1) var<storage, read_write> render_group_indirect_buffer : array<u32>;
@group(0) @binding(2) var<storage, read_write> dispatch_indirect_buffer : array<u32>;
Expand Down Expand Up @@ -33,8 +37,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {

// Calculate the base offset (in number of u32 items) into the render indirect and
// dispatch indirect arrays.
let rgi_base = sim_params.render_group_stride * group_index / 4u;
let di_base = sim_params.dispatch_stride * group_index / 4u;
let rgi_base = RENDER_GROUP_INDIRECT_STRIDE * group_index;
let di_base = DISPATCH_INDIRECT_STRIDE * group_index;

// Clear the rendering instance count, which will be upgraded by the update pass
// with the particles actually alive at the end of their update (after aged).
Expand All @@ -52,7 +56,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
render_group_indirect_buffer[rgi_base + RGI_OFFSET_MAX_UPDATE] = alive_count;

if (is_first_group) {
let rem_base = sim_params.render_effect_stride * effect_index / 4u;
let rem_base = RENDER_EFFECT_INDIRECT_STRIDE * effect_index;

// Copy the number of dead particles to a constant location, so that the
// init pass on next frame can atomically modify dead_count in parallel
Expand Down

0 comments on commit ffda9de

Please sign in to comment.