Skip to content

Commit

Permalink
Implemented particle sorting.
Browse files Browse the repository at this point in the history
  • Loading branch information
PathogenDavid committed Jul 24, 2023
1 parent 970cb91 commit 626e86e
Show file tree
Hide file tree
Showing 23 changed files with 888 additions and 6 deletions.
29 changes: 29 additions & 0 deletions THIRD-PARTY-NOTICES.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ ThreeL incorporates third-party libraries and assets licensed as follows.
- [DirectX Shader Compiler](#directx-shader-compiler)
- [Kenney Particle Pack](#kenney-particle-pack)
- [JSON for Modern C++](#json-for-modern-c)
- [MiniEngine Bitonic Sort](#miniengine-bitonic-sort)
- [Sponza](#sponza)
- [stb](#stb)
- [TinyGLTF](#tinygltf)
Expand Down Expand Up @@ -342,6 +343,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

# MiniEngine Bitonic Sort

https://github.com/microsoft/DirectX-Graphics-Samples/tree/b5f92e2251ee83db4d4c795b3cba5d470c52eaf8/MiniEngine

```
The MIT License (MIT)
Copyright (c) 2013-2015 Microsoft
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```

# Sponza

https://github.com/KhronosGroup/glTF-Sample-Models/tree/189f80d7d44f76d8f9be8e337d4c6cb85ef521a4/2.0/Sponza
Expand Down
199 changes: 199 additions & 0 deletions ThreeL/BitonicSort.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#include "pch.h"
#include "BitonicSort.h"

#include "ComputeContext.h"
#include "GpuResource.h"
#include "GraphicsContext.h"
#include "GraphicsCore.h"
#include "HlslCompiler.h"
#include "UavCounter.h"

BitonicSort::BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler)
{
// Compile all shaders
ShaderBlobs prepareIndirectArgs = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl", L"main", L"cs_6_0");
ShaderBlobs preSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0");
ShaderBlobs innerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0");
ShaderBlobs outerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0");
ShaderBlobs preSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });
ShaderBlobs innerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });
ShaderBlobs outerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });

// Create root signature and pipeline state objects
m_RootSignature = RootSignature(graphics, prepareIndirectArgs, L"Bitonic Sort Root Signature");

D3D12_COMPUTE_PIPELINE_STATE_DESC description =
{
.pRootSignature = m_RootSignature.Get(),
.CS = prepareIndirectArgs.ShaderBytecode(),
};
m_PrepareIndirectArgs = PipelineStateObject(graphics, description, L"Bitonic Sort Prepare Indirect Args");
description.CS = preSortCombined.ShaderBytecode();
m_PreSortCombined = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Combined)");
description.CS = innerSortCombined.ShaderBytecode();
m_InnerSortCombined = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Combined)");
description.CS = outerSortCombined.ShaderBytecode();
m_OuterSortCombined = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Combined)");
description.CS = preSortSeparate.ShaderBytecode();
m_PreSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Separate)");
description.CS = innerSortSeparate.ShaderBytecode();
m_InnerSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Separate)");
description.CS = outerSortSeparate.ShaderBytecode();
m_OuterSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Separate)");

// Create buffers for indirect arguments
D3D12_HEAP_PROPERTIES heapProperties = { D3D12_HEAP_TYPE_DEFAULT };
const uint32_t elementCount = 22 * 23 / 2;
const uint32_t bufferSize = elementCount * sizeof(D3D12_DISPATCH_ARGUMENTS);
D3D12_RESOURCE_DESC indirectArgumentsDescription = DescribeBufferResource(bufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
for (int i = 0; i < 2; i++)
{
ComPtr<ID3D12Resource> indirectArguments;
AssertSuccess(graphics.Device()->CreateCommittedResource
(
&heapProperties,
D3D12_HEAP_FLAG_CREATE_NOT_ZEROED,
&indirectArgumentsDescription,
D3D12_RESOURCE_STATE_COMMON,
nullptr,
IID_PPV_ARGS(&indirectArguments)
));

D3D12_UNORDERED_ACCESS_VIEW_DESC uavDescription =
{
.Format = DXGI_FORMAT_R32_TYPELESS,
.ViewDimension = D3D12_UAV_DIMENSION_BUFFER,
.Buffer =
{
.FirstElement = 0,
.NumElements = bufferSize / sizeof(uint32_t),
.Flags = D3D12_BUFFER_UAV_FLAG_RAW,
},
};
ResourceDescriptor indirectArgumentsUav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(indirectArguments.Get(), nullptr, uavDescription);

switch (i)
{
case 0:
indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Graphics Queue)");
m_GraphicsIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments));
m_GraphicsIndirectArgsBufferUav = indirectArgumentsUav;
break;
case 1:
indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Compute Queue)");
m_ComputeIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments));
m_ComputeIndirectArgsBufferUav = indirectArgumentsUav;
break;
default:
Fail("Unreachable");
}
}
}

namespace BitonicShader
{
enum RootParameters
{
RpGeneralArgs,
RpCounterBuffer,
RpSortBuffer,
RpIndirectArgs = RpSortBuffer,
RpSortArgs,
};

struct SortArgs
{
uint32_t CounterOffset;
uint32_t NullItem;
};

struct OuterSortArgs
{
uint32_t k;
uint32_t j;
};
}

void BitonicSort::Sort(ComputeContext& context, const BitonicSortParams& params)
{
Assert(params.Capacity > 1);

// MiniEngine doesn't assert this in their bitonic sort dispatch, but I'm pretty sure it's requred for sorting ascending to work correctly
// For descending the out of bounds reads will be 0, so
Assert(Math::IsPowerOfTwo(params.Capacity));

uint32_t alignedCapacity = Math::AlignPowerOfTwo(params.Capacity);
uint32_t maxIterations = Math::Log2(std::max(2048u, alignedCapacity)) - 10;

// Select the indirect arguments buffer to use based on the command queue we'll be submitted to
// (We need them to be separate to avoid conflcits between sorts potentially happening concurrently between async compute and graphics queues.)
RawGpuResource& indirectArgsBuffer = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? m_ComputeIndirectArgsBuffer : m_GraphicsIndirectArgsBuffer;
ResourceDescriptor& indirectArgsUav = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? m_ComputeIndirectArgsBufferUav : m_GraphicsIndirectArgsBufferUav;

// Set common root signature arguments
context->SetComputeRootSignature(m_RootSignature);
BitonicShader::SortArgs sortArgs =
{
.CounterOffset = 0,
.NullItem = params.SortAscending ? 0xFFFFFFFF : 0x00000000,
};
context->SetComputeRoot32BitConstants(BitonicShader::RpSortArgs, sizeof(sortArgs) / sizeof(uint32_t), &sortArgs, 0);

// Prepare indirect dispatch arguments
context->SetPipelineState(m_PrepareIndirectArgs);
context.TransitionResource(params.ItemCountBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, true);
context->SetComputeRoot32BitConstant(BitonicShader::RpGeneralArgs, maxIterations, 0);
context->SetComputeRootDescriptorTable(BitonicShader::RpCounterBuffer, params.ItemCountBuffer.Srv().ResidentHandle());
context->SetComputeRootDescriptorTable(BitonicShader::RpIndirectArgs, indirectArgsUav.ResidentHandle());
context.Dispatch(1);

// Pre-sort the list up to k = 2048
// This will also pad the list with the NullItem determined above so that the rest of the algorithm can operate without caring about the number of items used
//TODO: I don't think the NullItem thing is actually implemented correctly.
// I think the intent was that StoreKeyIndexPair in BitonicPreSort should've been checking the capacity of the list rather than the count.
// Maybe I'm missing something, but isn't the idea that you can skip all the bounds checks in InnerSort/OuterSort? The MiniEngine bitonic sort still checks it all the time.
// (If it didn't it'd end up barfing on SortAscending when the sort buffer's capacity isn't a power of two anyway -- it would be relying on out of bounds UAV accesses on tabled descriptors reading 0.)
// Ah ha, I'm not crazy. The implementation was changed to support non-power-of-two-sized lists and in the process the null padding was broken and made useless.
// https://github.com/microsoft/DirectX-Graphics-Samples/commit/def3a2cb9fb49f3005349a6238662729b16baf68
// Unfortunately the old implementation has its own problems, and I'm too half awake to implement my own bitoic sort.
// Plus I just want my particles to be sorted. Maybe some other time...
context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT);
context.TransitionResource(params.SortList, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
context.UavBarrier(params.SortList);
context->SetComputeRootDescriptorTable(BitonicShader::RpSortBuffer, params.SortListUav.ResidentHandle());

if (!params.SkipPreSort)
{
context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_PreSortCombined : m_PreSortSeparate);
context.DispatchIndirect(indirectArgsBuffer);
context.UavBarrier(params.SortList);
}

// Pre-sorting took care of swaps for k up to 2048, so now we continue at k = 4096
// (Note that some of the outer sorts will be skipped by dispatching zero-sized groups as needed once k grows too large)
uint32_t indirectArgsOffset = sizeof(D3D12_DISPATCH_ARGUMENTS); // Start after pre-sort args
for (uint32_t k = 4096; k <= alignedCapacity; k *= 2)
{
// Outer sort iterations -- Swaps for which the distance (j) exceeds the width of the LDS and goes directly through memory
context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_OuterSortCombined : m_OuterSortSeparate);
for (uint32_t j = k / 2; j >= 2048; j /= 2)
{
BitonicShader::OuterSortArgs outerArgs =
{
.k = k,
.j = j,
};
context->SetComputeRoot32BitConstants(BitonicShader::RpGeneralArgs, sizeof(outerArgs) / sizeof(uint32_t), &outerArgs, 0);
context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset);
indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS);
context.UavBarrier(params.SortList);
}

// Inner sort iteration -- Swaps for which the distance (j) fits within LDS so looping over j occurs within the shader directly
context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_InnerSortCombined : m_InnerSortSeparate);
context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset);
indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS);
context.UavBarrier(params.SortList);
}
}
69 changes: 69 additions & 0 deletions ThreeL/BitonicSort.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#pragma once
#include "RawGpuResource.h"
#include "ResourceDescriptor.h"
#include "RootSignature.h"
#include "PipelineStateObject.h"

struct ComputeContext;
class GpuResource;
struct GraphicsContext;
class GraphicsCore;
class HlslCompiler;
class UavCounter;

struct BitonicSortParams
{
enum SortItemKind
{
//! The sort key and index have been combined into a single 32-bit value, with the sort key in the upper bits
//! This format results in better sort performance at the expense of smaller indices
CombinedKeyIndex,
//! The sort key and index are separated in a pair of 32-bit values, with the index in X and the sort key in Y
//! This format uses more memory bandwidth and thus results in slower sorting, but you get the full 32-bit range for the indices
SeparateKeyIndex,
};

//! The list to be sorted
GpuResource& SortList;
ResourceDescriptor SortListUav;
//! The maximum number of items that SortList can hold
uint32_t Capacity;
//! The format of the elements within SortList
SortItemKind ItemKind;
//! A UavCounter specifying the number of valid entries in SortList
UavCounter& ItemCountBuffer;

//! If true, the pre-sorting phase will be skipped.
//! The caller asserts that the buffer is already partially sorted in blocks of 2048, meaning that the pre-sorting phase can be skipped.
//! (This might be the case if the sort list was built in chunks of groupshared memory.)
bool SkipPreSort;

bool SortAscending;
};

class BitonicSort
{
private:
RootSignature m_RootSignature;

PipelineStateObject m_PrepareIndirectArgs;

PipelineStateObject m_PreSortCombined;
PipelineStateObject m_InnerSortCombined;
PipelineStateObject m_OuterSortCombined;

PipelineStateObject m_PreSortSeparate;
PipelineStateObject m_InnerSortSeparate;
PipelineStateObject m_OuterSortSeparate;

RawGpuResource m_GraphicsIndirectArgsBuffer;
ResourceDescriptor m_GraphicsIndirectArgsBufferUav;
RawGpuResource m_ComputeIndirectArgsBuffer;
ResourceDescriptor m_ComputeIndirectArgsBufferUav;

public:
BitonicSort() = default;
BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler);

void Sort(ComputeContext& context, const BitonicSortParams& params);
};
30 changes: 30 additions & 0 deletions ThreeL/MathCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,34 @@ namespace Math
{
return (numerator + denominator - 1) / denominator;
}

inline uint8_t Log2(uint64_t x)
{
DWORD mostSignificantBit;
DWORD leastSignificantBit;

if (_BitScanReverse64(&mostSignificantBit, x) && _BitScanForward64(&leastSignificantBit, x))
{
uint8_t result = (uint8_t)mostSignificantBit;

// If x is not a perfect power of two (IE: multiple bits set) we round up to the next power of two
if (mostSignificantBit != leastSignificantBit)
{ result++; }

return result;
}

return 0;
}

template<typename T>
inline T AlignPowerOfTwo(T x)
{
return x == 0 ? 0 : 1 << Log2(x);
}

inline bool IsPowerOfTwo(uint32_t x)
{
return x && !(x & (x - 1u));
}
}
Loading

0 comments on commit 626e86e

Please sign in to comment.