Skip to content

Commit

Permalink
Embed strides into shader
Browse files Browse the repository at this point in the history
Embed the element strides of various data structures directly into the
`vfx_indirect` shader, instead of passing them at runtime through the
`SimParams` uniform buffer. Those strides are constant per GPU device, so they
cannot change after startup.
  • Loading branch information
djeedai committed May 19, 2024
1 parent a80db80 commit ffda9de
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 51 deletions.
49 changes: 12 additions & 37 deletions src/render/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ use crate::{
},
spawn::EffectSpawner,
CompiledParticleEffect, EffectProperties, EffectShader, EffectSimulation, HanabiPlugin,
ParticleLayout, PropertyLayout, RemovedEffectsEvent, SimulationCondition,
ParticleLayout, PropertyLayout, RemovedEffectsEvent, SimulationCondition, ToWgslString,
};

mod aligned_buffer_vec;
Expand Down Expand Up @@ -119,17 +119,6 @@ struct GpuSimParams {
///
/// This is only used by the `vfx_indirect` compute shader.
num_groups: u32,
/// Stride in bytes of an effect render indirect block, used to index the
/// effect's block based on its index.
render_effect_stride: u32,
/// Stride in bytes of a group render indirect block, used to index the
/// effect's block based on its index.
render_group_stride: u32,
/// Stride in bytes of a dispatch indirect block, used to index the effect's
/// block based on its index.
///
/// This is only used by the `vfx_indirect` compute shader.
dispatch_stride: u32,
}

impl Default for GpuSimParams {
Expand All @@ -142,9 +131,6 @@ impl Default for GpuSimParams {
real_delta_time: 0.04,
real_time: 0.0,
num_groups: 0,
render_effect_stride: 0, // invalid
render_group_stride: 0, // invalid
dispatch_stride: 0, // invalid
}
}
}
Expand Down Expand Up @@ -444,7 +430,15 @@ impl FromWorld for DispatchIndirectPipeline {
push_constant_ranges: &[],
});

let indirect_code = include_str!("vfx_indirect.wgsl");


let render_effect_indirect_stride_code = (render_effect_indirect_size.get() as u32).to_wgsl_string();
let render_group_indirect_stride_code = (render_group_indirect_size.get() as u32).to_wgsl_string();
let dispatch_indirect_stride_code = (dispatch_indirect_size.get() as u32).to_wgsl_string();
let indirect_code = include_str!("vfx_indirect.wgsl")
.replace("{{RENDER_EFFECT_INDIRECT_STRIDE}}", &render_effect_indirect_stride_code)
.replace("{{RENDER_GROUP_INDIRECT_STRIDE}}", &render_group_indirect_stride_code)
.replace("{{DISPATCH_INDIRECT_STRIDE}}", &dispatch_indirect_stride_code);

// Resolve imports. Because we don't insert this shader into Bevy' pipeline
// cache, we don't get that part "for free", so we have to do it manually here.
Expand All @@ -469,7 +463,7 @@ impl FromWorld for DispatchIndirectPipeline {
let shader_defs = default();

match composer.make_naga_module(NagaModuleDescriptor {
source: indirect_code,
source: &indirect_code,
file_path: "vfx_indirect.wgsl",
shader_defs,
..Default::default()
Expand Down Expand Up @@ -2283,41 +2277,22 @@ pub(crate) fn prepare_effects(
.sim_params_uniforms
.set(GpuSimParams::default());
{
let storage_align = effects_meta.gpu_limits.storage_buffer_align().get() as usize;
let render_effect_stride =
effects_meta.gpu_limits.render_effect_indirect_size().get() as u32;
let render_group_stride = effects_meta.gpu_limits.render_group_indirect_size().get() as u32;

let gpu_sim_params = effects_meta.sim_params_uniforms.get_mut();
let sim_params = *sim_params;
*gpu_sim_params = sim_params.into();

gpu_sim_params.num_groups = total_group_count;

// FIXME - Those are shader compile time constants, which only change with the
// GPU adapter limits (so, fixed while the app runs). Stop wasting uniform
// storage and hardcode into shader instead.
gpu_sim_params.render_effect_stride = render_effect_stride;
gpu_sim_params.render_group_stride = render_group_stride;
gpu_sim_params.dispatch_stride = next_multiple_of(
GpuDispatchIndirect::min_size().get() as usize,
storage_align,
) as u32;

trace!(
"Simulation parameters: time={} delta_time={} virtual_time={} \
virtual_delta_time={} real_time={} real_delta_time={} num_groups={} \
render_effect_stride={} render_group_stride={} dispatch_stride={}",
virtual_delta_time={} real_time={} real_delta_time={} num_groups={}",
gpu_sim_params.time,
gpu_sim_params.delta_time,
gpu_sim_params.virtual_time,
gpu_sim_params.virtual_delta_time,
gpu_sim_params.real_time,
gpu_sim_params.real_delta_time,
gpu_sim_params.num_groups,
gpu_sim_params.render_effect_stride,
gpu_sim_params.render_group_stride,
gpu_sim_params.dispatch_stride,
);
}
// FIXME - There's no simple way to tell if write_buffer() reallocates...
Expand Down
11 changes: 0 additions & 11 deletions src/render/vfx_common.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,8 @@ struct SimParams {
real_delta_time: f32,
/// Real time in seconds since the start of simulation.
real_time: f32,
//#ifdef SIM_PARAMS_INDIRECT_DATA
/// Number of groups batched together.
num_groups: u32,
/// Stride in bytes of the RenderEffectMetadata struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
render_effect_stride: u32,
/// Stride in bytes of the RenderGroupIndirect struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
render_group_stride: u32,
/// Stride in bytes of the DispatchIndirect struct. Used to calculate
/// the position of each effect's data into the buffer of a batch.
dispatch_stride: u32,
//#endif
}

struct Spawner {
Expand Down
10 changes: 7 additions & 3 deletions src/render/vfx_indirect.wgsl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
REM_OFFSET_MAX_SPAWN, RGI_OFFSET_INSTANCE_COUNT, REM_OFFSET_PING
}

const RENDER_EFFECT_INDIRECT_STRIDE: u32 = {{RENDER_EFFECT_INDIRECT_STRIDE}} / 4u;
const RENDER_GROUP_INDIRECT_STRIDE: u32 = {{RENDER_GROUP_INDIRECT_STRIDE}} / 4u;
const DISPATCH_INDIRECT_STRIDE: u32 = {{DISPATCH_INDIRECT_STRIDE}} / 4u;

@group(0) @binding(0) var<storage, read_write> render_effect_indirect_buffer : array<u32>;
@group(0) @binding(1) var<storage, read_write> render_group_indirect_buffer : array<u32>;
@group(0) @binding(2) var<storage, read_write> dispatch_indirect_buffer : array<u32>;
Expand Down Expand Up @@ -33,8 +37,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {

// Calculate the base offset (in number of u32 items) into the render indirect and
// dispatch indirect arrays.
let rgi_base = sim_params.render_group_stride * group_index / 4u;
let di_base = sim_params.dispatch_stride * group_index / 4u;
let rgi_base = RENDER_GROUP_INDIRECT_STRIDE * group_index;
let di_base = DISPATCH_INDIRECT_STRIDE * group_index;

// Clear the rendering instance count, which will be upgraded by the update pass
// with the particles actually alive at the end of their update (after aged).
Expand All @@ -52,7 +56,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
render_group_indirect_buffer[rgi_base + RGI_OFFSET_MAX_UPDATE] = alive_count;

if (is_first_group) {
let rem_base = sim_params.render_effect_stride * effect_index / 4u;
let rem_base = RENDER_EFFECT_INDIRECT_STRIDE * effect_index;

// Copy the number of dead particles to a constant location, so that the
// init pass on next frame can atomically modify dead_count in parallel
Expand Down

0 comments on commit ffda9de

Please sign in to comment.