From 626e86ed65939f82b4da69c33368641386f58152 Mon Sep 17 00:00:00 2001 From: David Maas Date: Mon, 24 Jul 2023 15:21:21 -0500 Subject: [PATCH] Implemented particle sorting. --- THIRD-PARTY-NOTICES.md | 29 +++ ThreeL/BitonicSort.cpp | 199 ++++++++++++++++++ ThreeL/BitonicSort.h | 69 ++++++ ThreeL/MathCommon.h | 30 +++ ThreeL/ParticleSystem.cpp | 45 +++- ThreeL/ParticleSystem.h | 3 + ThreeL/ResourceManager.cpp | 3 + ThreeL/ResourceManager.h | 5 +- ThreeL/ShaderInterop.h | 2 + ThreeL/Shaders/ParticleRender.hlsl | 5 +- ThreeL/Shaders/ParticleSystem.cs.hlsl | 8 +- ThreeL/ThreeL.props | 1 + ThreeL/ThreeL.vcxproj | 7 + ThreeL/ThreeL.vcxproj.filters | 20 ++ ThreeL/UavCounter.cpp | 16 +- ThreeL/UavCounter.h | 2 + external/BitonicSort/BitonicInnerSort.cs.hlsl | 114 ++++++++++ external/BitonicSort/BitonicOuterSort.cs.hlsl | 55 +++++ external/BitonicSort/BitonicPreSort.cs.hlsl | 128 +++++++++++ .../BitonicPrepareIndirectArgs.cs.hlsl | 62 ++++++ external/BitonicSort/BitonicSortCommon.hlsli | 59 ++++++ external/README.md | 29 +++ tooling/Common.targets | 3 +- 23 files changed, 888 insertions(+), 6 deletions(-) create mode 100644 ThreeL/BitonicSort.cpp create mode 100644 ThreeL/BitonicSort.h create mode 100644 external/BitonicSort/BitonicInnerSort.cs.hlsl create mode 100644 external/BitonicSort/BitonicOuterSort.cs.hlsl create mode 100644 external/BitonicSort/BitonicPreSort.cs.hlsl create mode 100644 external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl create mode 100644 external/BitonicSort/BitonicSortCommon.hlsli create mode 100644 external/README.md diff --git a/THIRD-PARTY-NOTICES.md b/THIRD-PARTY-NOTICES.md index 8d1686e..6bc1fb8 100644 --- a/THIRD-PARTY-NOTICES.md +++ b/THIRD-PARTY-NOTICES.md @@ -7,6 +7,7 @@ ThreeL incorporates third-party libraries and assets licensed as follows. 
- [DirectX Shader Compiler](#directx-shader-compiler) - [Kenney Particle Pack](#kenney-particle-pack) - [JSON for Modern C++](#json-for-modern-c) +- [MiniEngine Bitonic Sort](#miniengine-bitonic-sort) - [Sponza](#sponza) - [stb](#stb) - [TinyGLTF](#tinygltf) @@ -342,6 +343,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ``` +# MiniEngine Bitonic Sort + +https://github.com/microsoft/DirectX-Graphics-Samples/tree/b5f92e2251ee83db4d4c795b3cba5d470c52eaf8/MiniEngine + +``` +The MIT License (MIT) + +Copyright (c) 2013-2015 Microsoft + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` + # Sponza https://github.com/KhronosGroup/glTF-Sample-Models/tree/189f80d7d44f76d8f9be8e337d4c6cb85ef521a4/2.0/Sponza diff --git a/ThreeL/BitonicSort.cpp b/ThreeL/BitonicSort.cpp new file mode 100644 index 0000000..f4c16c2 --- /dev/null +++ b/ThreeL/BitonicSort.cpp @@ -0,0 +1,199 @@ +#include "pch.h" +#include "BitonicSort.h" + +#include "ComputeContext.h" +#include "GpuResource.h" +#include "GraphicsContext.h" +#include "GraphicsCore.h" +#include "HlslCompiler.h" +#include "UavCounter.h" + +BitonicSort::BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler) +{ + // Compile all shaders (each *Separate variant is the same source as its *Combined counterpart, compiled with BITONICSORT_64BIT defined) + ShaderBlobs prepareIndirectArgs = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl", L"main", L"cs_6_0"); + ShaderBlobs preSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0"); + ShaderBlobs innerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0"); + ShaderBlobs outerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0"); + ShaderBlobs preSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" }); + ShaderBlobs innerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" }); + ShaderBlobs outerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" }); + + // Create the root signature (built from the prepareIndirectArgs shader blobs) and the pipeline state objects, which all share it + m_RootSignature = RootSignature(graphics, prepareIndirectArgs, L"Bitonic Sort Root Signature"); + + D3D12_COMPUTE_PIPELINE_STATE_DESC description = + { + .pRootSignature = m_RootSignature.Get(), + .CS = prepareIndirectArgs.ShaderBytecode(), + }; + m_PrepareIndirectArgs = PipelineStateObject(graphics, description, L"Bitonic Sort 
Prepare Indirect Args"); + description.CS = preSortCombined.ShaderBytecode(); + m_PreSortCombined = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Combined)"); + description.CS = innerSortCombined.ShaderBytecode(); + m_InnerSortCombined = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Combined)"); + description.CS = outerSortCombined.ShaderBytecode(); + m_OuterSortCombined = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Combined)"); + description.CS = preSortSeparate.ShaderBytecode(); + m_PreSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Separate)"); + description.CS = innerSortSeparate.ShaderBytecode(); + m_InnerSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Separate)"); + description.CS = outerSortSeparate.ShaderBytecode(); + m_OuterSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Separate)"); + + // Create buffers for indirect arguments -- two identical buffers, one for the graphics queue (i == 0) and one for the compute queue (i == 1) + D3D12_HEAP_PROPERTIES heapProperties = { D3D12_HEAP_TYPE_DEFAULT }; + const uint32_t elementCount = 22 * 23 / 2; // NOTE(review): presumably the worst-case number of indirect dispatches Sort can issue (value appears inherited from MiniEngine) -- TODO confirm + const uint32_t bufferSize = elementCount * sizeof(D3D12_DISPATCH_ARGUMENTS); + D3D12_RESOURCE_DESC indirectArgumentsDescription = DescribeBufferResource(bufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + for (int i = 0; i < 2; i++) + { + ComPtr indirectArguments; + AssertSuccess(graphics.Device()->CreateCommittedResource + ( + &heapProperties, + D3D12_HEAP_FLAG_CREATE_NOT_ZEROED, + &indirectArgumentsDescription, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(&indirectArguments) + )); + + D3D12_UNORDERED_ACCESS_VIEW_DESC uavDescription = + { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D12_UAV_DIMENSION_BUFFER, + .Buffer = + { + .FirstElement = 0, + .NumElements = bufferSize / sizeof(uint32_t), + .Flags = D3D12_BUFFER_UAV_FLAG_RAW, + }, + }; + ResourceDescriptor indirectArgumentsUav = 
graphics.ResourceDescriptorManager().CreateUnorderedAccessView(indirectArguments.Get(), nullptr, uavDescription); + + switch (i) + { + case 0: + indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Graphics Queue)"); + m_GraphicsIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments)); + m_GraphicsIndirectArgsBufferUav = indirectArgumentsUav; + break; + case 1: + indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Compute Queue)"); + m_ComputeIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments)); + m_ComputeIndirectArgsBufferUav = indirectArgumentsUav; + break; + default: + Fail("Unreachable"); + } + } +} + +namespace BitonicShader +{ + enum RootParameters + { + RpGeneralArgs, + RpCounterBuffer, + RpSortBuffer, + RpIndirectArgs = RpSortBuffer, + RpSortArgs, + }; + + struct SortArgs + { + uint32_t CounterOffset; + uint32_t NullItem; + }; + + struct OuterSortArgs + { + uint32_t k; + uint32_t j; + }; +} + +void BitonicSort::Sort(ComputeContext& context, const BitonicSortParams& params) +{ + Assert(params.Capacity > 1); + + // MiniEngine doesn't assert this in their bitonic sort dispatch, but I'm pretty sure it's required for sorting ascending to work correctly + // For descending the out of bounds reads will be 0, which matches the descending NullItem (0x00000000), so presumably that direction would tolerate a non-power-of-two capacity + Assert(Math::IsPowerOfTwo(params.Capacity)); + + uint32_t alignedCapacity = Math::AlignPowerOfTwo(params.Capacity); + uint32_t maxIterations = Math::Log2(std::max(2048u, alignedCapacity)) - 10; + + // Select the indirect arguments buffer to use based on the command queue we'll be submitted to + // (We need them to be separate to avoid conflicts between sorts potentially happening concurrently between async compute and graphics queues.) + RawGpuResource& indirectArgsBuffer = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? m_ComputeIndirectArgsBuffer : m_GraphicsIndirectArgsBuffer; + ResourceDescriptor& indirectArgsUav = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? 
m_ComputeIndirectArgsBufferUav : m_GraphicsIndirectArgsBufferUav; + + // Set common root signature arguments + context->SetComputeRootSignature(m_RootSignature); + BitonicShader::SortArgs sortArgs = + { + .CounterOffset = 0, + .NullItem = params.SortAscending ? 0xFFFFFFFF : 0x00000000, + }; + context->SetComputeRoot32BitConstants(BitonicShader::RpSortArgs, sizeof(sortArgs) / sizeof(uint32_t), &sortArgs, 0); + + // Prepare indirect dispatch arguments + context->SetPipelineState(m_PrepareIndirectArgs); + context.TransitionResource(params.ItemCountBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); + context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, true); + context->SetComputeRoot32BitConstant(BitonicShader::RpGeneralArgs, maxIterations, 0); + context->SetComputeRootDescriptorTable(BitonicShader::RpCounterBuffer, params.ItemCountBuffer.Srv().ResidentHandle()); + context->SetComputeRootDescriptorTable(BitonicShader::RpIndirectArgs, indirectArgsUav.ResidentHandle()); + context.Dispatch(1); + + // Pre-sort the list up to k = 2048 + // This will also pad the list with the NullItem determined above so that the rest of the algorithm can operate without caring about the number of items used + //TODO: I don't think the NullItem thing is actually implemented correctly. + // I think the intent was that StoreKeyIndexPair in BitonicPreSort should've been checking the capacity of the list rather than the count. + // Maybe I'm missing something, but isn't the idea that you can skip all the bounds checks in InnerSort/OuterSort? The MiniEngine bitonic sort still checks it all the time. + // (If it didn't it'd end up barfing on SortAscending when the sort buffer's capacity isn't a power of two anyway -- it would be relying on out of bounds UAV accesses on tabled descriptors reading 0.) + // Ah ha, I'm not crazy. 
The implementation was changed to support non-power-of-two-sized lists and in the process the null padding was broken and made useless. + // https://github.com/microsoft/DirectX-Graphics-Samples/commit/def3a2cb9fb49f3005349a6238662729b16baf68 + // Unfortunately the old implementation has its own problems, and I'm too half awake to implement my own bitonic sort. + // Plus I just want my particles to be sorted. Maybe some other time... + context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); + context.TransitionResource(params.SortList, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + context.UavBarrier(params.SortList); + context->SetComputeRootDescriptorTable(BitonicShader::RpSortBuffer, params.SortListUav.ResidentHandle()); + + if (!params.SkipPreSort) + { + context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_PreSortCombined : m_PreSortSeparate); + context.DispatchIndirect(indirectArgsBuffer); + context.UavBarrier(params.SortList); + } + + // Pre-sorting took care of swaps for k up to 2048, so now we continue at k = 4096 + // (Note that some of the outer sorts will be skipped by dispatching zero-sized groups as needed once k grows too large) + uint32_t indirectArgsOffset = sizeof(D3D12_DISPATCH_ARGUMENTS); // Start after pre-sort args + for (uint32_t k = 4096; k <= alignedCapacity; k *= 2) + { + // Outer sort iterations -- Swaps for which the distance (j) exceeds the width of the LDS and goes directly through memory + context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? 
m_OuterSortCombined : m_OuterSortSeparate); + for (uint32_t j = k / 2; j >= 2048; j /= 2) + { + BitonicShader::OuterSortArgs outerArgs = + { + .k = k, + .j = j, + }; + context->SetComputeRoot32BitConstants(BitonicShader::RpGeneralArgs, sizeof(outerArgs) / sizeof(uint32_t), &outerArgs, 0); + context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset); + indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS); + context.UavBarrier(params.SortList); + } + + // Inner sort iteration -- Swaps for which the distance (j) fits within LDS so looping over j occurs within the shader directly + context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_InnerSortCombined : m_InnerSortSeparate); + context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset); + indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS); + context.UavBarrier(params.SortList); + } +} diff --git a/ThreeL/BitonicSort.h b/ThreeL/BitonicSort.h new file mode 100644 index 0000000..b249230 --- /dev/null +++ b/ThreeL/BitonicSort.h @@ -0,0 +1,69 @@ +#pragma once +#include "RawGpuResource.h" +#include "ResourceDescriptor.h" +#include "RootSignature.h" +#include "PipelineStateObject.h" + +struct ComputeContext; +class GpuResource; +struct GraphicsContext; +class GraphicsCore; +class HlslCompiler; +class UavCounter; + +struct BitonicSortParams +{ + enum SortItemKind + { + //! The sort key and index have been combined into a single 32-bit value, with the sort key in the upper bits + //! This format results in better sort performance at the expense of smaller indices + CombinedKeyIndex, + //! The sort key and index are separated in a pair of 32-bit values, with the index in X and the sort key in Y + //! This format uses more memory bandwidth and thus results in slower sorting, but you get the full 32-bit range for the indices + SeparateKeyIndex, + }; + + //! The list to be sorted + GpuResource& SortList; + ResourceDescriptor SortListUav; + //! 
The maximum number of items that SortList can hold + uint32_t Capacity; + //! The format of the elements within SortList + SortItemKind ItemKind; + //! A UavCounter specifying the number of valid entries in SortList + UavCounter& ItemCountBuffer; + + //! If true, the pre-sorting phase will be skipped. + //! The caller asserts that the buffer is already partially sorted in blocks of 2048, meaning that the pre-sorting phase can be skipped. + //! (This might be the case if the sort list was built in chunks of groupshared memory.) + bool SkipPreSort; + + bool SortAscending; //!< Requests an ascending sort when true; BitonicSort::Sort uses it to pick the NullItem value (0xFFFFFFFF when true, 0x00000000 otherwise) +}; + +class BitonicSort +{ +private: + RootSignature m_RootSignature; + + PipelineStateObject m_PrepareIndirectArgs; + + PipelineStateObject m_PreSortCombined; + PipelineStateObject m_InnerSortCombined; + PipelineStateObject m_OuterSortCombined; + + PipelineStateObject m_PreSortSeparate; + PipelineStateObject m_InnerSortSeparate; + PipelineStateObject m_OuterSortSeparate; + + RawGpuResource m_GraphicsIndirectArgsBuffer; + ResourceDescriptor m_GraphicsIndirectArgsBufferUav; + RawGpuResource m_ComputeIndirectArgsBuffer; + ResourceDescriptor m_ComputeIndirectArgsBufferUav; + +public: + BitonicSort() = default; + BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler); + + void Sort(ComputeContext& context, const BitonicSortParams& params); +}; diff --git a/ThreeL/MathCommon.h b/ThreeL/MathCommon.h index c124248..c14e1e2 100644 --- a/ThreeL/MathCommon.h +++ b/ThreeL/MathCommon.h @@ -34,4 +34,34 @@ namespace Math { return (numerator + denominator - 1) / denominator; } + + inline uint8_t Log2(uint64_t x) // Returns the base-2 logarithm of x rounded up; yields 0 when x is 0 (no bit set) or 1 + { + DWORD mostSignificantBit; + DWORD leastSignificantBit; + + if (_BitScanReverse64(&mostSignificantBit, x) && _BitScanForward64(&leastSignificantBit, x)) + { + uint8_t result = (uint8_t)mostSignificantBit; + + // If x is not a perfect power of two (IE: the highest and lowest set bits differ) we round up to the next power of two + if (mostSignificantBit != leastSignificantBit) + { result++; } + + return result; + 
} + + return 0; + } + + template + inline T AlignPowerOfTwo(T x) + { + return x == 0 ? 0 : 1 << Log2(x); + } + + inline bool IsPowerOfTwo(uint32_t x) + { + return x && !(x & (x - 1u)); + } } diff --git a/ThreeL/ParticleSystem.cpp b/ThreeL/ParticleSystem.cpp index 7bdfcc0..1ca329d 100644 --- a/ThreeL/ParticleSystem.cpp +++ b/ThreeL/ParticleSystem.cpp @@ -73,6 +73,36 @@ ParticleSystem::ParticleSystem(ResourceManager& resources, const std::wstring& d spriteBuffer->SetName(std::format(L"'{}' Particle Sprites", debugName).c_str()); m_ParticleSpriteBuffer = RawGpuResource(std::move(spriteBuffer)); + // Allocate sort buffer + uint32_t sortBufferSizeBytes = sizeof(uint2) * m_Capacity; + D3D12_RESOURCE_DESC sortBufferDescription = DescribeBufferResource(sortBufferSizeBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + ComPtr sortBuffer; + AssertSuccess(m_Graphics.Device()->CreateCommittedResource + ( + &heapProperties, + D3D12_HEAP_FLAG_CREATE_NOT_ZEROED, + &sortBufferDescription, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(&sortBuffer) + )); + sortBuffer->SetName(std::format(L"'{}' Particle Sort Buffer", debugName).c_str()); + + D3D12_UNORDERED_ACCESS_VIEW_DESC uavDescription = + { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D12_UAV_DIMENSION_BUFFER, + .Buffer = + { + .FirstElement = 0, + .NumElements = sortBufferSizeBytes / sizeof(uint32_t), + .Flags = D3D12_BUFFER_UAV_FLAG_RAW, + }, + }; + + m_ParticleSpriteSortBufferUav = m_Graphics.ResourceDescriptorManager().CreateUnorderedAccessView(sortBuffer.Get(), nullptr, uavDescription); + m_ParticleSpriteSortBuffer = RawGpuResource(std::move(sortBuffer)); + // Allocate DrawIndirect arguments buffer D3D12_RESOURCE_DESC drawIndirectArgumentsDescription = DescribeBufferResource(sizeof(D3D12_DRAW_ARGUMENTS), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); ComPtr drawIndirectArguments; @@ -135,6 +165,7 @@ void ParticleSystem::Update(ComputeContext& context, float deltaTime, D3D12_GPU_ 
context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpParticleSpritesOut, m_ParticleSpriteBuffer.GpuAddress()); context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpLivingParticleCountOut, outputStateBuffer.Counter.GpuAddress()); context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpDrawIndirectArguments, m_DrawIndirectArguments.GpuAddress()); + context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpParticleSpriteSortBuffer, m_ParticleSpriteSortBuffer.GpuAddress()); // Update existing particles //PERF: For systems with very large capacities that are not near capacity, this ends up spawning a bunch of useless threads @@ -166,11 +197,22 @@ void ParticleSystem::Update(ComputeContext& context, float deltaTime, D3D12_GPU_ context.Dispatch(1); // Sort particle sprites - //TODO + BitonicSortParams sortParams = + { + .SortList = m_ParticleSpriteSortBuffer, + .SortListUav = m_ParticleSpriteSortBufferUav, + .Capacity = m_Capacity, + .ItemKind = BitonicSortParams::SeparateKeyIndex, + .ItemCountBuffer = outputStateBuffer.Counter, + .SkipPreSort = false, + .SortAscending = false, + }; + m_Resources.BitonicSort.Sort(context, sortParams); // Transition all resources for their use in render context.UavBarrier(); context.TransitionResource(m_ParticleSpriteBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); + context.TransitionResource(m_ParticleSpriteSortBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); context.TransitionResource(m_DrawIndirectArguments, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); // Update complete, save a sync point for render @@ -191,6 +233,7 @@ void ParticleSystem::Render(GraphicsContext& context, D3D12_GPU_VIRTUAL_ADDRESS context->SetGraphicsRootSignature(m_Resources.ParticleRenderRootSignature); context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpParticleBuffer, m_ParticleSpriteBuffer.GpuAddress()); + 
context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpSortedParticleLookupBuffer, m_ParticleSpriteSortBuffer.GpuAddress()); context->SetGraphicsRootConstantBufferView(ShaderInterop::ParticleRender::RpPerFrameCb, perFrameCb); context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpMaterialHeap, m_Resources.PbrMaterials.BufferGpuAddress()); diff --git a/ThreeL/ParticleSystem.h b/ThreeL/ParticleSystem.h index cfd513a..4d1bece 100644 --- a/ThreeL/ParticleSystem.h +++ b/ThreeL/ParticleSystem.h @@ -39,6 +39,9 @@ class ParticleSystem RawGpuResource m_ParticleSpriteBuffer; + RawGpuResource m_ParticleSpriteSortBuffer; + ResourceDescriptor m_ParticleSpriteSortBufferUav; + RawGpuResource m_DrawIndirectArguments; public: diff --git a/ThreeL/ResourceManager.cpp b/ThreeL/ResourceManager.cpp index 7ca94e5..61421cc 100644 --- a/ThreeL/ResourceManager.cpp +++ b/ThreeL/ResourceManager.cpp @@ -1,6 +1,7 @@ #include "pch.h" #include "ResourceManager.h" +#include "GraphicsCore.h" #include "HlslCompiler.h" ResourceManager::ResourceManager(GraphicsCore& graphics) @@ -8,6 +9,8 @@ ResourceManager::ResourceManager(GraphicsCore& graphics) { HlslCompiler hlslCompiler; + BitonicSort = ::BitonicSort(Graphics, hlslCompiler); + // Compile all shaders ShaderBlobs pbrVs = hlslCompiler.CompileShader(L"Shaders/Pbr.hlsl", L"VsMain", L"vs_6_0"); ShaderBlobs pbrPs = hlslCompiler.CompileShader(L"Shaders/Pbr.hlsl", L"PsMain", L"ps_6_0"); diff --git a/ThreeL/ResourceManager.h b/ThreeL/ResourceManager.h index a7a30fb..e715734 100644 --- a/ThreeL/ResourceManager.h +++ b/ThreeL/ResourceManager.h @@ -1,10 +1,12 @@ #pragma once -#include "GraphicsCore.h" +#include "BitonicSort.h" #include "MeshHeap.h" #include "PbrMaterialHeap.h" #include "PipelineStateObject.h" #include "RootSignature.h" +class GraphicsCore; + namespace MeshInputSlot { enum MeshInputSlot : UINT @@ -20,6 +22,7 @@ struct ResourceManager GraphicsCore& Graphics; PbrMaterialHeap PbrMaterials; MeshHeap 
MeshHeap; + BitonicSort BitonicSort; // No complicated PSO management here, we don't need very many so we just make them all by hand RootSignature PbrRootSignature; diff --git a/ThreeL/ShaderInterop.h b/ThreeL/ShaderInterop.h index 08a771a..61b1884 100644 --- a/ThreeL/ShaderInterop.h +++ b/ThreeL/ShaderInterop.h @@ -104,6 +104,7 @@ namespace ShaderInterop enum RootParameters { RpParticleBuffer, + RpSortedParticleLookupBuffer, RpPerFrameCb, RpMaterialHeap, RpLightHeap, @@ -177,6 +178,7 @@ namespace ShaderInterop RpParticleSpritesOut, RpLivingParticleCountOut, RpDrawIndirectArguments, + RpParticleSpriteSortBuffer, }; static const uint32_t SpawnGroupSize = 64; diff --git a/ThreeL/Shaders/ParticleRender.hlsl b/ThreeL/Shaders/ParticleRender.hlsl index 14e7a7e..78b9f57 100644 --- a/ThreeL/Shaders/ParticleRender.hlsl +++ b/ThreeL/Shaders/ParticleRender.hlsl @@ -1,5 +1,6 @@ #define PBR_ROOT_SIGNATURE \ - "SRV(t0, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)," \ + "SRV(t0, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE, visibility = SHADER_VISIBILITY_VERTEX)," \ + "SRV(t1, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE, visibility = SHADER_VISIBILITY_VERTEX)," \ "CBV(b1)," \ "SRV(t0, flags = DATA_STATIC)," \ "SRV(t1, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)," \ @@ -21,6 +22,7 @@ #include "Pbr.hlsl" StructuredBuffer g_Particles : register(t0, space900); +ByteAddressBuffer g_SortedParticleLookup : register(t1, space900); //=================================================================================================================================================== // Vertex shader @@ -30,6 +32,7 @@ StructuredBuffer g_Particles : register(t0, space900); PsInput VsMainParticle(uint vertexId : SV_VertexID, uint particleIndex : SV_InstanceID) { PsInput result; + particleIndex = g_SortedParticleLookup.Load(particleIndex * 8); ParticleSprite particle = g_Particles[particleIndex]; result.Uv0 = float2(uint2(vertexId >> 1, vertexId) & 1.xx); diff --git 
a/ThreeL/Shaders/ParticleSystem.cs.hlsl b/ThreeL/Shaders/ParticleSystem.cs.hlsl index 3bcca6f..1935623 100644 --- a/ThreeL/Shaders/ParticleSystem.cs.hlsl +++ b/ThreeL/Shaders/ParticleSystem.cs.hlsl @@ -10,6 +10,7 @@ RWStructuredBuffer g_ParticleSpritesOut : register(u1, space900) RWByteAddressBuffer g_LivingParticleCountOut : register(u2, space900); RWByteAddressBuffer g_DrawIndirectArguments : register(u3, space900); +RWByteAddressBuffer g_ParticleSpriteSortBuffer : register(u4, space900); struct ParticleSystemParams { @@ -52,6 +53,7 @@ ConstantBuffer g_Params : register(b0, space900); "UAV(u1, space = 900, flags = DATA_VOLATILE)," \ "UAV(u2, space = 900, flags = DATA_VOLATILE)," \ "UAV(u3, space = 900, flags = DATA_VOLATILE)," \ + "UAV(u4, space = 900, flags = DATA_VOLATILE)," \ "" void OutputParticle(uint outputIndex, ParticleState state); @@ -135,9 +137,13 @@ void OutputParticle(uint outputIndex, ParticleState state) g_ParticleStatesOut[outputIndex] = state; // Create a sprite for this particle - //TODO: Emit a sort key //TODO: Culling? 
g_ParticleSpritesOut[outputIndex] = MakeSprite(state); + + // Emit sort index/key pair + float3 toEye = state.WorldPosition - g_PerFrame.EyePosition; + float distanceSquared = dot(toEye, toEye); + g_ParticleSpriteSortBuffer.Store2(outputIndex * 8, uint2(outputIndex, asuint(distanceSquared))); } [numthreads(1, 1, 1)] diff --git a/ThreeL/ThreeL.props b/ThreeL/ThreeL.props index b6fb6af..0b317c2 100644 --- a/ThreeL/ThreeL.props +++ b/ThreeL/ThreeL.props @@ -42,6 +42,7 @@ --> <_Assets Include="$(ExternalDir)Assets/**" TargetPath="Assets/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" /> + <_Assets Include="$(ExternalDir)BitonicSort/**" TargetPath="Shaders/BitonicSort/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" /> <_Assets Include="$(MSBuildThisFileDirectory)Assets/**" TargetPath="Assets/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" /> <_Assets Include="$(MSBuildThisFileDirectory)Shaders/**" TargetPath="Shaders/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" /> diff --git a/ThreeL/ThreeL.vcxproj b/ThreeL/ThreeL.vcxproj index f9c5303..7a8177d 100644 --- a/ThreeL/ThreeL.vcxproj +++ b/ThreeL/ThreeL.vcxproj @@ -86,6 +86,7 @@ + @@ -158,6 +159,7 @@ + @@ -219,12 +221,17 @@ + + + + + diff --git a/ThreeL/ThreeL.vcxproj.filters b/ThreeL/ThreeL.vcxproj.filters index 97ffbde..8e7fae2 100644 --- a/ThreeL/ThreeL.vcxproj.filters +++ b/ThreeL/ThreeL.vcxproj.filters @@ -90,6 +90,7 @@ + @@ -206,6 +207,7 @@ + @@ -218,6 +220,9 @@ Shaders + + external\BitonicSort + @@ -232,6 +237,9 @@ {2680a101-7374-4cae-9099-cdea269e6f42} + + {2406106f-1795-4877-909f-77751fc56038} + @@ -267,6 +275,18 @@ Shaders + + external\BitonicSort + + + external\BitonicSort + + + external\BitonicSort + + + external\BitonicSort + diff --git a/ThreeL/UavCounter.cpp b/ThreeL/UavCounter.cpp index 6fa7fcc..e514d09 100644 --- a/ThreeL/UavCounter.cpp +++ b/ThreeL/UavCounter.cpp @@ -30,7 +30,21 
@@ UavCounter::UavCounter(GraphicsCore& graphics, const std::wstring& debugName) .Flags = D3D12_BUFFER_UAV_FLAG_RAW, }, }; - m_Uav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(m_Resource.Get(), nullptr, uavDescription);; + m_Uav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(m_Resource.Get(), nullptr, uavDescription); + + D3D12_SHADER_RESOURCE_VIEW_DESC srvDescription = + { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D12_SRV_DIMENSION_BUFFER, + .Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING, + .Buffer = + { + .FirstElement = 0, + .NumElements = 1, + .Flags = D3D12_BUFFER_SRV_FLAG_RAW, + }, + }; + m_Srv = graphics.ResourceDescriptorManager().CreateShaderResourceView(m_Resource.Get(), srvDescription); m_GpuAddress = m_Resource->GetGPUVirtualAddress(); } diff --git a/ThreeL/UavCounter.h b/ThreeL/UavCounter.h index a7d7c19..0e7cba9 100644 --- a/ThreeL/UavCounter.h +++ b/ThreeL/UavCounter.h @@ -13,6 +13,7 @@ class UavCounter : public GpuResource { private: ResourceDescriptor m_Uav; + ResourceDescriptor m_Srv; D3D12_GPU_VIRTUAL_ADDRESS m_GpuAddress = 0; public: @@ -24,5 +25,6 @@ class UavCounter : public GpuResource inline operator ID3D12Resource* () const { return Resource(); } inline ResourceDescriptor Uav() const { return m_Uav; }; + inline ResourceDescriptor Srv() const { return m_Srv; }; inline D3D12_GPU_VIRTUAL_ADDRESS GpuAddress() const { return m_GpuAddress; }; }; diff --git a/external/BitonicSort/BitonicInnerSort.cs.hlsl b/external/BitonicSort/BitonicInnerSort.cs.hlsl new file mode 100644 index 0000000..3d1b33c --- /dev/null +++ b/external/BitonicSort/BitonicInnerSort.cs.hlsl @@ -0,0 +1,114 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// This code is licensed under the MIT License (MIT). 
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. +// +// Developed by Minigraph +// +// Author: James Stanard +// +// Description: The bitonic sort works by sorting groups of size k, +// starting with k=2 and doubling until k>=NumItems. To sort the +// group, keys are compared with a distance of j, which starts at half +// of k and continues halving down to 1. When j is 1024 and less, the +// compare and swap can happen in LDS, and these iterations form the +// "inner sort". Inner sorting happens in LDS and loops. Outer sorting +// happens in memory and does not loop. (Looping happens on the CPU by +// issuing sequential dispatches and barriers.) + + +#include "BitonicSortCommon.hlsli" + +RWByteAddressBuffer g_SortBuffer : register(u0); + +cbuffer Constants : register(b0) +{ + uint k; // k >= 4096 +}; + +#ifdef BITONICSORT_64BIT + +groupshared uint gs_SortKeys[2048]; +groupshared uint gs_SortIndices[2048]; + +void LoadKeyIndexPair( uint Element, uint ListCount ) +{ + uint2 KeyIndex = Element < ListCount ? g_SortBuffer.Load2(Element * 8) : NullItem; + gs_SortIndices[Element & 2047] = KeyIndex.x; + gs_SortKeys[Element & 2047] = KeyIndex.y; +} + +void StoreKeyIndexPair( uint Element, uint ListCount ) +{ + if (Element < ListCount) + g_SortBuffer.Store2(Element * 8, uint2(gs_SortIndices[Element & 2047], gs_SortKeys[Element & 2047])); +} + +#else // 32-bit packed key/index pairs + +groupshared uint gs_SortKeys[2048]; + +void LoadKeyIndexPair( uint Element, uint ListCount ) +{ + gs_SortKeys[Element & 2047] = Element < ListCount ? 
g_SortBuffer.Load(Element * 4) : NullItem; +} + +void StoreKeyIndexPair( uint Element, uint ListCount ) +{ + if (Element < ListCount) + g_SortBuffer.Store(Element * 4, gs_SortKeys[Element & 2047]); +} + +#endif + +[RootSignature(BitonicSort_RootSig)] +[numthreads(1024, 1, 1)] +void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + const uint ListCount = g_CounterBuffer.Load(CounterOffset); + + // Item index of the start of this group + const uint GroupStart = Gid.x * 2048; + + // Load from memory into LDS to prepare sort + LoadKeyIndexPair(GroupStart + GI, ListCount); + LoadKeyIndexPair(GroupStart + GI + 1024, ListCount); + + GroupMemoryBarrierWithGroupSync(); + + // This is better unrolled because it reduces ALU and because some + // architectures can load/store two LDS items in a single instruction + // as long as their separation is a compile-time constant. + [unroll] + for (uint j = 1024; j > 0; j /= 2) + { + uint Index2 = InsertOneBit(GI, j); + uint Index1 = Index2 ^ j; + + uint A = gs_SortKeys[Index1]; + uint B = gs_SortKeys[Index2]; + + if (ShouldSwap(A, B)) + { + // Swap the keys + gs_SortKeys[Index1] = B; + gs_SortKeys[Index2] = A; + +#ifdef BITONICSORT_64BIT + // Then swap the indices (for 64-bit sorts) + A = gs_SortIndices[Index1]; + B = gs_SortIndices[Index2]; + gs_SortIndices[Index1] = B; + gs_SortIndices[Index2] = A; +#endif + } + + GroupMemoryBarrierWithGroupSync(); + } + + StoreKeyIndexPair(GroupStart + GI, ListCount); + StoreKeyIndexPair(GroupStart + GI + 1024, ListCount); +} diff --git a/external/BitonicSort/BitonicOuterSort.cs.hlsl b/external/BitonicSort/BitonicOuterSort.cs.hlsl new file mode 100644 index 0000000..88db6e2 --- /dev/null +++ b/external/BitonicSort/BitonicOuterSort.cs.hlsl @@ -0,0 +1,55 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// This code is licensed under the MIT License (MIT). 
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. +// +// Developed by Minigraph +// +// Author: James Stanard +// + +#include "BitonicSortCommon.hlsli" + +RWByteAddressBuffer g_SortBuffer : register(u0); + +cbuffer Constants : register(b0) +{ + uint k; // k >= 4096 + uint j; // j >= 2048 && j < k +}; + +#ifdef BITONICSORT_64BIT + #define Element uint2 + #define LoadElement(idx) g_SortBuffer.Load2(idx * 8) + #define StoreElement(idx, elem) g_SortBuffer.Store2(idx * 8, elem) +#else + #define Element uint + #define LoadElement(idx) g_SortBuffer.Load(idx * 4) + #define StoreElement(idx, elem) g_SortBuffer.Store(idx * 4, elem) +#endif + +[RootSignature(BitonicSort_RootSig)] +[numthreads(1024, 1, 1)] +void main( uint3 DTid : SV_DispatchThreadID ) +{ + const uint ListCount = g_CounterBuffer.Load(CounterOffset); + + // Form unique index pair from dispatch thread ID + uint Index2 = InsertOneBit(DTid.x, j); + uint Index1 = Index2 ^ (k == 2 * j ? k - 1 : j); + + if (Index2 >= ListCount) + return; + + Element A = LoadElement(Index1); + Element B = LoadElement(Index2); + + if (ShouldSwap(A, B)) + { + StoreElement(Index1, B); + StoreElement(Index2, A); + } +} diff --git a/external/BitonicSort/BitonicPreSort.cs.hlsl b/external/BitonicSort/BitonicPreSort.cs.hlsl new file mode 100644 index 0000000..dbb5eb9 --- /dev/null +++ b/external/BitonicSort/BitonicPreSort.cs.hlsl @@ -0,0 +1,128 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// This code is licensed under the MIT License (MIT). +// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. 
+// +// Developed by Minigraph +// +// Author: James Stanard +// +// Description: A bitonic sort must eventually sort the power-of-two +// ceiling of items. E.g. 391 items -> 512 items. Because of this +// "null items" must be used as padding at the end of the list so that +// they can participate in the sort but remain at the end of the list. +// +// The pre-sort does two things. It appends null items as needed, and +// it does the initial sort for k values up to 2048. This is because +// we can run 1024 threads, each of which can compare and swap two +// elements without contention. And because we can always fit 2048 +// keys & indices in LDS with occupancy greater than one. (A single +// thread group can use as much as 32KB of LDS.) + + +#include "BitonicSortCommon.hlsli" + +RWByteAddressBuffer g_SortBuffer : register(u0); + +#ifdef BITONICSORT_64BIT + +groupshared uint gs_SortIndices[2048]; +groupshared uint gs_SortKeys[2048]; + +void FillSortKey( uint Element, uint ListCount ) +{ + // Unused elements must sort to the end + if (Element < ListCount) + { + uint2 KeyIndexPair = g_SortBuffer.Load2(Element * 8); + gs_SortKeys[Element & 2047] = KeyIndexPair.y; + gs_SortIndices[Element & 2047] = KeyIndexPair.x; + } + else + { + gs_SortKeys[Element & 2047] = NullItem; + } +} + +void StoreKeyIndexPair( uint Element, uint ListCount) +{ + if (Element < ListCount) + g_SortBuffer.Store2(Element * 8, uint2(gs_SortIndices[Element & 2047], gs_SortKeys[Element & 2047])); +} + +#else // 32-bit packed key/index pairs + +groupshared uint gs_SortKeys[2048]; + +void FillSortKey( uint Element, uint ListCount ) +{ + // Unused elements must sort to the end + gs_SortKeys[Element & 2047] = (Element < ListCount ?
g_SortBuffer.Load(Element * 4) : NullItem); +} + +void StoreKeyIndexPair( uint Element, uint ListCount ) +{ + if (Element < ListCount) + g_SortBuffer.Store(Element * 4, gs_SortKeys[Element & 2047]); +} + +#endif + +[RootSignature(BitonicSort_RootSig)] +[numthreads(1024, 1, 1)] +void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex ) +{ + // Item index of the start of this group + const uint GroupStart = Gid.x * 2048; + + // Actual number of items that need sorting + const uint ListCount = g_CounterBuffer.Load(CounterOffset); + + FillSortKey(GroupStart + GI, ListCount); + FillSortKey(GroupStart + GI + 1024, ListCount); + + GroupMemoryBarrierWithGroupSync(); + + uint k; + + // This is better unrolled because it reduces ALU and because some + // architectures can load/store two LDS items in a single instruction + // as long as their separation is a compile-time constant. + [unroll] + for (k = 2; k <= 2048; k <<= 1) + { + //[unroll] + for (uint j = k / 2; j > 0; j /= 2) + { + uint Index2 = InsertOneBit(GI, j); + uint Index1 = Index2 ^ (k == 2 * j ? k - 1 : j); + + uint A = gs_SortKeys[Index1]; + uint B = gs_SortKeys[Index2]; + + if (ShouldSwap(A, B)) + { + // Swap the keys + gs_SortKeys[Index1] = B; + gs_SortKeys[Index2] = A; + +#ifdef BITONICSORT_64BIT + // Then swap the indices (for 64-bit sorts) + A = gs_SortIndices[Index1]; + B = gs_SortIndices[Index2]; + gs_SortIndices[Index1] = B; + gs_SortIndices[Index2] = A; +#endif + } + + GroupMemoryBarrierWithGroupSync(); + } + } + + // Write sorted results to memory + StoreKeyIndexPair(GroupStart + GI, ListCount); + StoreKeyIndexPair(GroupStart + GI + 1024, ListCount); +} diff --git a/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl b/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl new file mode 100644 index 0000000..b5504ad --- /dev/null +++ b/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl @@ -0,0 +1,62 @@ +// +// Copyright (c) Microsoft. All rights reserved. 
+// This code is licensed under the MIT License (MIT). +// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. +// +// Developed by Minigraph +// +// Author: James Stanard +// + +#include "BitonicSortCommon.hlsli" + +RWByteAddressBuffer g_IndirectArgsBuffer : register(u0); + +cbuffer Constants : register(b0) +{ + uint MaxIterations; +} + +uint NextPow2( uint Val ) +{ + uint Mask = (1u << firstbithigh(Val)) - 1; + return (Val + Mask) & ~Mask; +} + +[RootSignature(BitonicSort_RootSig)] +[numthreads(22, 1, 1)] +void main( uint GI : SV_GroupIndex ) +{ + if (GI >= MaxIterations) + return; + + uint ListCount = g_CounterBuffer.Load(CounterOffset); + uint k = 2048u << GI; + + // We need one more iteration every time the number of thread groups doubles + if (k > NextPow2((ListCount + 2047) & ~2047)) + ListCount = 0; + + uint PrevDispatches = GI * (GI + 1) / 2; + uint Offset = 12 * PrevDispatches; + + // Generate outer sort dispatch arguments + for (uint j = k / 2; j > 1024; j /= 2) + { + // All of the groups of size 2j that are full + uint CompleteGroups = (ListCount & ~(2 * j - 1)) / 2048; + + // Remaining items must only be sorted if there are more than j of them + uint PartialGroups = ((uint)max(int(ListCount - CompleteGroups * 2048 - j), 0) + 1023) / 1024; + + g_IndirectArgsBuffer.Store3(Offset, uint3(CompleteGroups + PartialGroups, 1, 1)); + + Offset += 12; + } + + // The inner sort always sorts all groups (rounded up to multiples of 2048) + g_IndirectArgsBuffer.Store3(Offset, uint3((ListCount + 2047) / 2048, 1, 1)); +} diff --git a/external/BitonicSort/BitonicSortCommon.hlsli b/external/BitonicSort/BitonicSortCommon.hlsli new file mode 100644 index 0000000..7dee18e --- /dev/null +++ b/external/BitonicSort/BitonicSortCommon.hlsli @@ -0,0 +1,59 @@ +// +// Copyright (c) Microsoft. All rights reserved. 
+// This code is licensed under the MIT License (MIT). +// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY +// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR +// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. +// +// Developed by Minigraph +// +// Author: James Stanard +// + +#define BitonicSort_RootSig \ + "RootFlags(0), " \ + "RootConstants(b0, num32BitConstants = 2)," \ + "DescriptorTable(SRV(t0, numDescriptors = 1))," \ + "DescriptorTable(UAV(u0, numDescriptors = 1))," \ + "RootConstants(b1, num32BitConstants = 2)" + +ByteAddressBuffer g_CounterBuffer : register(t0); + +cbuffer CB1 : register(b1) +{ + // Offset into counter buffer where this list's item count is stored + uint CounterOffset; + + // A sort key that will end up at the end of the list; to be used to pad + // lists in LDS (always 2048 items). + // Descending: 0x00000000 + // Ascending: 0xffffffff + // Also used by the ShouldSwap() function to invert ordering. + uint NullItem; +} + +// Takes Value and widens it by one bit at the location of the bit +// in the mask. A one is inserted in the space. OneBitMask must +// have one and only one bit set. +uint InsertOneBit( uint Value, uint OneBitMask ) +{ + uint Mask = OneBitMask - 1; + return (Value & ~Mask) << 1 | (Value & Mask) | OneBitMask; +} + +// Determines if two sort keys should be swapped in the list. NullItem is +// either 0 or 0xffffffff. XOR with the NullItem will either invert the bits +// (effectively a negation) or leave the bits alone. When the NullItem is +// 0, we are sorting descending, so when A < B, they should swap. For an +// ascending sort, ~A < ~B should swap. +bool ShouldSwap(uint A, uint B) +{ + return (A ^ NullItem) < (B ^ NullItem); +} + +// Same as above, but only compares the upper 32-bit word.
+bool ShouldSwap(uint2 A, uint2 B) +{ + return (A.y ^ NullItem) < (B.y ^ NullItem); +} diff --git a/external/README.md b/external/README.md new file mode 100644 index 0000000..329acda --- /dev/null +++ b/external/README.md @@ -0,0 +1,29 @@ +ThreeL External Dependencies +=============================================================================== + +Most of ThreeL's dependencies are incorporated into this repository directly for the sake of simplicity (a few others come from NuGet.) + +They're taken from the following versions: + +| Dependency | File/Directory | Upstream Version | Modified | +|------------|----------------|------------------|---------------| +| Dear ImGui | `DearImGui/` | [`docking a88e5be7f4`](https://github.com/ocornut/imgui/tree/a88e5be7f478233e74c72c72eabb1d5f1cb69bb5) | Yes +| JSON for Modern C++ | `json.hpp` | [`v3.10.4`](https://github.com/nlohmann/json/tree/fec56a1a16c6e1c1b1f4e116a20e79398282626c) | +| Minigraph Bitonic Sort | `BitonicSort/` | [`a79e01c4c3`](https://github.com/microsoft/DirectX-Graphics-Samples/tree/a79e01c4c39e6d40f4b078688ff95814d166d34f) | Yes +| Sponza | `Assets/Sponza/` | [`189f80d7d4`](https://github.com/KhronosGroup/glTF-Sample-Models/tree/189f80d7d44f76d8f9be8e337d4c6cb85ef521a4) | +| stb_image | `stb_image.h` | [`TinyGLTF fork v2.8.9`](https://github.com/syoyo/tinygltf/tree/350c2968025882bdf823e7892d02328548b46435) | +| TinyGLTF | `tiny_gltf.h` | [`v2.8.9`](https://github.com/syoyo/tinygltf/tree/350c2968025882bdf823e7892d02328548b46435) | +| xxHash | `xxhash.c`/`xxhash.h` | [`0656ed7539`](https://github.com/Cyan4973/xxHash/tree/0656ed753994ce3d04a39ca132242e98fddef136) | + +See [the third-party notice listing](../THIRD-PARTY-NOTICES.md) for details on licenses governing these dependencies. 
+ +## Changes made + +### Dear ImGui + +* Added names to Direct3D 12 objects created by the backend + +### Minigraph Bitonic Sort + +* Renamed files +* Fixed some compiler warnings in `BitonicPrepareIndirectArgs.cs.hlsl` diff --git a/tooling/Common.targets b/tooling/Common.targets index 4b0d495..61a682d 100644 --- a/tooling/Common.targets +++ b/tooling/Common.targets @@ -21,8 +21,9 @@ NotUsing - + +