From 626e86ed65939f82b4da69c33368641386f58152 Mon Sep 17 00:00:00 2001
From: David Maas <contact@pathogenstudios.com>
Date: Mon, 24 Jul 2023 15:21:21 -0500
Subject: [PATCH] Implemented particle sorting.

---
 THIRD-PARTY-NOTICES.md                        |  29 +++
 ThreeL/BitonicSort.cpp                        | 199 ++++++++++++++++++
 ThreeL/BitonicSort.h                          |  69 ++++++
 ThreeL/MathCommon.h                           |  30 +++
 ThreeL/ParticleSystem.cpp                     |  45 +++-
 ThreeL/ParticleSystem.h                       |   3 +
 ThreeL/ResourceManager.cpp                    |   3 +
 ThreeL/ResourceManager.h                      |   5 +-
 ThreeL/ShaderInterop.h                        |   2 +
 ThreeL/Shaders/ParticleRender.hlsl            |   5 +-
 ThreeL/Shaders/ParticleSystem.cs.hlsl         |   8 +-
 ThreeL/ThreeL.props                           |   1 +
 ThreeL/ThreeL.vcxproj                         |   7 +
 ThreeL/ThreeL.vcxproj.filters                 |  20 ++
 ThreeL/UavCounter.cpp                         |  16 +-
 ThreeL/UavCounter.h                           |   2 +
 external/BitonicSort/BitonicInnerSort.cs.hlsl | 114 ++++++++++
 external/BitonicSort/BitonicOuterSort.cs.hlsl |  55 +++++
 external/BitonicSort/BitonicPreSort.cs.hlsl   | 128 +++++++++++
 .../BitonicPrepareIndirectArgs.cs.hlsl        |  62 ++++++
 external/BitonicSort/BitonicSortCommon.hlsli  |  59 ++++++
 external/README.md                            |  29 +++
 tooling/Common.targets                        |   3 +-
 23 files changed, 888 insertions(+), 6 deletions(-)
 create mode 100644 ThreeL/BitonicSort.cpp
 create mode 100644 ThreeL/BitonicSort.h
 create mode 100644 external/BitonicSort/BitonicInnerSort.cs.hlsl
 create mode 100644 external/BitonicSort/BitonicOuterSort.cs.hlsl
 create mode 100644 external/BitonicSort/BitonicPreSort.cs.hlsl
 create mode 100644 external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl
 create mode 100644 external/BitonicSort/BitonicSortCommon.hlsli
 create mode 100644 external/README.md

diff --git a/THIRD-PARTY-NOTICES.md b/THIRD-PARTY-NOTICES.md
index 8d1686e..6bc1fb8 100644
--- a/THIRD-PARTY-NOTICES.md
+++ b/THIRD-PARTY-NOTICES.md
@@ -7,6 +7,7 @@ ThreeL incorporates third-party libraries and assets licensed as follows.
 - [DirectX Shader Compiler](#directx-shader-compiler)
 - [Kenney Particle Pack](#kenney-particle-pack)
 - [JSON for Modern C++](#json-for-modern-c)
+- [MiniEngine Bitonic Sort](#miniengine-bitonic-sort)
 - [Sponza](#sponza)
 - [stb](#stb)
 - [TinyGLTF](#tinygltf)
@@ -342,6 +343,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ```
 
+# MiniEngine Bitonic Sort
+
+https://github.com/microsoft/DirectX-Graphics-Samples/tree/b5f92e2251ee83db4d4c795b3cba5d470c52eaf8/MiniEngine
+
+```
+The MIT License (MIT)
+
+Copyright (c) 2013-2015 Microsoft
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+```
+
 # Sponza
 
 https://github.com/KhronosGroup/glTF-Sample-Models/tree/189f80d7d44f76d8f9be8e337d4c6cb85ef521a4/2.0/Sponza
diff --git a/ThreeL/BitonicSort.cpp b/ThreeL/BitonicSort.cpp
new file mode 100644
index 0000000..f4c16c2
--- /dev/null
+++ b/ThreeL/BitonicSort.cpp
@@ -0,0 +1,199 @@
+#include "pch.h"
+#include "BitonicSort.h"
+
+#include "ComputeContext.h"
+#include "GpuResource.h"
+#include "GraphicsContext.h"
+#include "GraphicsCore.h"
+#include "HlslCompiler.h"
+#include "UavCounter.h"
+
+BitonicSort::BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler) // Creates everything Sort uses: compiled shaders, root signature, pipeline states, and the indirect argument buffers
+{
+    // Compile all shaders -- the pre/inner/outer sort shaders are each compiled twice: once without defines (the "Combined" variants) and once with BITONICSORT_64BIT defined (the "Separate" variants)
+    ShaderBlobs prepareIndirectArgs = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl", L"main", L"cs_6_0");
+    ShaderBlobs preSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0");
+    ShaderBlobs innerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0");
+    ShaderBlobs outerSortCombined = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0");
+    ShaderBlobs preSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicPreSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });
+    ShaderBlobs innerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicInnerSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });
+    ShaderBlobs outerSortSeparate = hlslCompiler.CompileShader(L"Shaders/BitonicSort/BitonicOuterSort.cs.hlsl", L"main", L"cs_6_0", { L"BITONICSORT_64BIT" });
+
+    // Create root signature and pipeline state objects -- the root signature is built from the prepareIndirectArgs blobs and is shared by every pipeline state below
+    m_RootSignature = RootSignature(graphics, prepareIndirectArgs, L"Bitonic Sort Root Signature");
+
+    D3D12_COMPUTE_PIPELINE_STATE_DESC description =
+    {
+        .pRootSignature = m_RootSignature.Get(),
+        .CS = prepareIndirectArgs.ShaderBytecode(),
+    };
+    m_PrepareIndirectArgs = PipelineStateObject(graphics, description, L"Bitonic Sort Prepare Indirect Args");
+    description.CS = preSortCombined.ShaderBytecode(); // The same description is reused for each PSO, only the compute shader is swapped out
+    m_PreSortCombined = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Combined)");
+    description.CS = innerSortCombined.ShaderBytecode();
+    m_InnerSortCombined = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Combined)");
+    description.CS = outerSortCombined.ShaderBytecode();
+    m_OuterSortCombined = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Combined)");
+    description.CS = preSortSeparate.ShaderBytecode();
+    m_PreSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Pre-Sort (Separate)");
+    description.CS = innerSortSeparate.ShaderBytecode();
+    m_InnerSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Inner Sort (Separate)");
+    description.CS = outerSortSeparate.ShaderBytecode();
+    m_OuterSortSeparate = PipelineStateObject(graphics, description, L"Bitonic Outer Sort (Separate)");
+
+    // Create buffers for indirect arguments -- two identical buffers are created, one for sorts on the graphics queue and one for sorts on the compute queue
+    D3D12_HEAP_PROPERTIES heapProperties = { D3D12_HEAP_TYPE_DEFAULT };
+    const uint32_t elementCount = 22 * 23 / 2; // 253 entries -- presumably the worst-case number of indirect dispatches, TODO confirm against BitonicPrepareIndirectArgs.cs.hlsl
+    const uint32_t bufferSize = elementCount * sizeof(D3D12_DISPATCH_ARGUMENTS);
+    D3D12_RESOURCE_DESC indirectArgumentsDescription = DescribeBufferResource(bufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+    for (int i = 0; i < 2; i++) // i = 0 is the graphics queue buffer, i = 1 is the compute queue buffer
+    {
+        ComPtr<ID3D12Resource> indirectArguments;
+        AssertSuccess(graphics.Device()->CreateCommittedResource
+        (
+            &heapProperties,
+            D3D12_HEAP_FLAG_CREATE_NOT_ZEROED,
+            &indirectArgumentsDescription,
+            D3D12_RESOURCE_STATE_COMMON,
+            nullptr,
+            IID_PPV_ARGS(&indirectArguments)
+        ));
+
+        D3D12_UNORDERED_ACCESS_VIEW_DESC uavDescription =
+        {
+            .Format = DXGI_FORMAT_R32_TYPELESS,
+            .ViewDimension = D3D12_UAV_DIMENSION_BUFFER,
+            .Buffer =
+            {
+                .FirstElement = 0,
+                .NumElements = bufferSize / sizeof(uint32_t), // The raw view is described in units of 32-bit elements rather than dispatch argument structs
+                .Flags = D3D12_BUFFER_UAV_FLAG_RAW,
+            },
+        };
+        ResourceDescriptor indirectArgumentsUav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(indirectArguments.Get(), nullptr, uavDescription);
+
+        switch (i)
+        {
+            case 0:
+                indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Graphics Queue)");
+                m_GraphicsIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments));
+                m_GraphicsIndirectArgsBufferUav = indirectArgumentsUav;
+                break;
+            case 1:
+                indirectArguments->SetName(L"Bitonic Sort Indirect Arguments (Compute Queue)");
+                m_ComputeIndirectArgsBuffer = RawGpuResource(std::move(indirectArguments));
+                m_ComputeIndirectArgsBufferUav = indirectArgumentsUav;
+                break;
+            default:
+                Fail("Unreachable");
+        }
+    }
+}
+
+namespace BitonicShader // Root parameter indices and root constant layouts used by BitonicSort::Sort
+{
+    enum RootParameters
+    {
+        RpGeneralArgs, // 32-bit root constants: maxIterations for the prepare pass, OuterSortArgs for the outer sort passes
+        RpCounterBuffer, // Descriptor table: SRV of the item count buffer
+        RpSortBuffer, // Descriptor table: UAV of the list being sorted
+        RpIndirectArgs = RpSortBuffer, // Same slot as RpSortBuffer -- Sort binds the indirect arguments UAV here for the prepare pass, then rebinds the sort list
+        RpSortArgs, // 32-bit root constants: SortArgs
+    };
+
+    struct SortArgs
+    {
+        uint32_t CounterOffset; // Always set to 0 by BitonicSort::Sort
+        uint32_t NullItem; // 0xFFFFFFFF when sorting ascending, 0x00000000 otherwise
+    };
+
+    struct OuterSortArgs
+    {
+        uint32_t k; // Value of k for the current outer loop iteration in Sort (4096, 8192, ... up to the aligned capacity)
+        uint32_t j; // Value of j for the current outer sort pass (k / 2, halving down to 2048)
+    };
+}
+
+void BitonicSort::Sort(ComputeContext& context, const BitonicSortParams& params) // Issues the bitonic sort dispatches for params.SortList on the given context
+{
+    Assert(params.Capacity > 1);
+
+    // MiniEngine doesn't assert this in their bitonic sort dispatch, but I'm pretty sure it's required for sorting ascending to work correctly
+    // For descending the out of bounds reads will be 0 (the same value as the descending NullItem below), so presumably it would be tolerated there -- TODO confirm
+    Assert(Math::IsPowerOfTwo(params.Capacity));
+
+    uint32_t alignedCapacity = Math::AlignPowerOfTwo(params.Capacity); // Equal to Capacity whenever the power-of-two assert above holds
+    uint32_t maxIterations = Math::Log2(std::max(2048u, alignedCapacity)) - 10; // Log2 rounds up, so this is 1 for capacities up to 2048 and grows by one for each doubling after that
+
+    // Select the indirect arguments buffer to use based on the command queue we'll be submitted to
+    // (We need them to be separate to avoid conflicts between sorts potentially happening concurrently between async compute and graphics queues.)
+    RawGpuResource& indirectArgsBuffer = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? m_ComputeIndirectArgsBuffer : m_GraphicsIndirectArgsBuffer;
+    ResourceDescriptor& indirectArgsUav = context.QueueType() == D3D12_COMMAND_LIST_TYPE_COMPUTE ? m_ComputeIndirectArgsBufferUav : m_GraphicsIndirectArgsBufferUav;
+
+    // Set common root signature arguments
+    context->SetComputeRootSignature(m_RootSignature);
+    BitonicShader::SortArgs sortArgs =
+    {
+        .CounterOffset = 0,
+        .NullItem = params.SortAscending ? 0xFFFFFFFF : 0x00000000,
+    };
+    context->SetComputeRoot32BitConstants(BitonicShader::RpSortArgs, sizeof(sortArgs) / sizeof(uint32_t), &sortArgs, 0);
+
+    // Prepare indirect dispatch arguments
+    context->SetPipelineState(m_PrepareIndirectArgs);
+    context.TransitionResource(params.ItemCountBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, true);
+    context->SetComputeRoot32BitConstant(BitonicShader::RpGeneralArgs, maxIterations, 0);
+    context->SetComputeRootDescriptorTable(BitonicShader::RpCounterBuffer, params.ItemCountBuffer.Srv().ResidentHandle());
+    context->SetComputeRootDescriptorTable(BitonicShader::RpIndirectArgs, indirectArgsUav.ResidentHandle());
+    context.Dispatch(1);
+
+    // Pre-sort the list up to k = 2048
+    // This will also pad the list with the NullItem determined above so that the rest of the algorithm can operate without caring about the number of items used
+    //TODO: I don't think the NullItem thing is actually implemented correctly.
+    // I think the intent was that StoreKeyIndexPair in BitonicPreSort should've been checking the capacity of the list rather than the count.
+    // Maybe I'm missing something, but isn't the idea that you can skip all the bounds checks in InnerSort/OuterSort? The MiniEngine bitonic sort still checks it all the time.
+    // (If it didn't it'd end up barfing on SortAscending when the sort buffer's capacity isn't a power of two anyway -- it would be relying on out of bounds UAV accesses on tabled descriptors reading 0.)
+    // Ah ha, I'm not crazy. The implementation was changed to support non-power-of-two-sized lists and in the process the null padding was broken and made useless.
+    // https://github.com/microsoft/DirectX-Graphics-Samples/commit/def3a2cb9fb49f3005349a6238662729b16baf68
+    // Unfortunately the old implementation has its own problems, and I'm too half awake to implement my own bitonic sort.
+    // Plus I just want my particles to be sorted. Maybe some other time...
+    context.TransitionResource(indirectArgsBuffer, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT);
+    context.TransitionResource(params.SortList, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+    context.UavBarrier(params.SortList);
+    context->SetComputeRootDescriptorTable(BitonicShader::RpSortBuffer, params.SortListUav.ResidentHandle()); // Replaces the indirect arguments UAV bound to this same slot for the prepare pass
+
+    if (!params.SkipPreSort)
+    {
+        context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_PreSortCombined : m_PreSortSeparate);
+        context.DispatchIndirect(indirectArgsBuffer); // No offset given, the pre-sort arguments are the first entry in the buffer
+        context.UavBarrier(params.SortList);
+    }
+
+    // Pre-sorting took care of swaps for k up to 2048, so now we continue at k = 4096
+    // (Note that some of the outer sorts will be skipped by dispatching zero-sized groups as needed once k grows too large)
+    uint32_t indirectArgsOffset = sizeof(D3D12_DISPATCH_ARGUMENTS); // Start after pre-sort args
+    for (uint32_t k = 4096; k <= alignedCapacity; k *= 2)
+    {
+        // Outer sort iterations -- Swaps for which the distance (j) exceeds the width of the LDS and goes directly through memory
+        context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_OuterSortCombined : m_OuterSortSeparate);
+        for (uint32_t j = k / 2; j >= 2048; j /= 2)
+        {
+            BitonicShader::OuterSortArgs outerArgs =
+            {
+                .k = k,
+                .j = j,
+            };
+            context->SetComputeRoot32BitConstants(BitonicShader::RpGeneralArgs, sizeof(outerArgs) / sizeof(uint32_t), &outerArgs, 0);
+            context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset);
+            indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS); // Every dispatch consumes the next entry in the indirect arguments buffer
+            context.UavBarrier(params.SortList);
+        }
+
+        // Inner sort iteration -- Swaps for which the distance (j) fits within LDS so looping over j occurs within the shader directly
+        context->SetPipelineState(params.ItemKind == BitonicSortParams::CombinedKeyIndex ? m_InnerSortCombined : m_InnerSortSeparate);
+        context.DispatchIndirect(indirectArgsBuffer, indirectArgsOffset);
+        indirectArgsOffset += sizeof(D3D12_DISPATCH_ARGUMENTS);
+        context.UavBarrier(params.SortList);
+    }
+}
diff --git a/ThreeL/BitonicSort.h b/ThreeL/BitonicSort.h
new file mode 100644
index 0000000..b249230
--- /dev/null
+++ b/ThreeL/BitonicSort.h
@@ -0,0 +1,69 @@
+#pragma once
+#include "RawGpuResource.h"
+#include "ResourceDescriptor.h"
+#include "RootSignature.h"
+#include "PipelineStateObject.h"
+
+struct ComputeContext;
+class GpuResource;
+struct GraphicsContext;
+class GraphicsCore;
+class HlslCompiler;
+class UavCounter;
+
+struct BitonicSortParams //! Describes a single sort request passed to BitonicSort::Sort
+{
+    enum SortItemKind
+    {
+        //! The sort key and index have been combined into a single 32-bit value, with the sort key in the upper bits
+        //! This format results in better sort performance at the expense of smaller indices
+        CombinedKeyIndex,
+        //! The sort key and index are separated in a pair of 32-bit values, with the index in X and the sort key in Y
+        //! This format uses more memory bandwidth and thus results in slower sorting, but you get the full 32-bit range for the indices
+        SeparateKeyIndex,
+    };
+
+    //! The list to be sorted
+    GpuResource& SortList;
+    ResourceDescriptor SortListUav; //!< Unordered access view of SortList, bound as a descriptor table by BitonicSort::Sort
+    //! The maximum number of items that SortList can hold (BitonicSort::Sort asserts that this is greater than 1 and a power of two)
+    uint32_t Capacity;
+    //! The format of the elements within SortList
+    SortItemKind ItemKind;
+    //! A UavCounter specifying the number of valid entries in SortList
+    UavCounter& ItemCountBuffer;
+
+    //! If true, the pre-sorting phase will be skipped.
+    //! The caller asserts that the buffer is already partially sorted in blocks of 2048, meaning that the pre-sorting phase can be skipped.
+    //! (This might be the case if the sort list was built in chunks of groupshared memory.)
+    bool SkipPreSort;
+
+    bool SortAscending; //!< Requested sort direction, BitonicSort::Sort uses it to select the NullItem value (0xFFFFFFFF when true, 0x00000000 when false)
+};
+
+class BitonicSort //! Owns the pipeline states and indirect argument buffers for the bitonic sort and issues sorts via Sort
+{
+private:
+    RootSignature m_RootSignature; //!< Shared by all of the pipeline states below
+    
+    PipelineStateObject m_PrepareIndirectArgs; //!< Dispatched once at the start of every sort to fill the indirect arguments buffer
+
+    PipelineStateObject m_PreSortCombined; //!< The "Combined" pipeline states are used for BitonicSortParams::CombinedKeyIndex
+    PipelineStateObject m_InnerSortCombined;
+    PipelineStateObject m_OuterSortCombined;
+
+    PipelineStateObject m_PreSortSeparate; //!< The "Separate" pipeline states are used for every other item kind (compiled with BITONICSORT_64BIT defined)
+    PipelineStateObject m_InnerSortSeparate;
+    PipelineStateObject m_OuterSortSeparate;
+
+    RawGpuResource m_GraphicsIndirectArgsBuffer; //!< Indirect arguments used when the context's queue type is not compute
+    ResourceDescriptor m_GraphicsIndirectArgsBufferUav;
+    RawGpuResource m_ComputeIndirectArgsBuffer; //!< Indirect arguments used when the context's queue type is D3D12_COMMAND_LIST_TYPE_COMPUTE
+    ResourceDescriptor m_ComputeIndirectArgsBufferUav;
+
+public:
+    BitonicSort() = default; //!< Leaves every member default-constructed (ResourceManager default-constructs its instance and assigns the real one afterwards)
+    BitonicSort(GraphicsCore& graphics, HlslCompiler& hlslCompiler);
+
+    void Sort(ComputeContext& context, const BitonicSortParams& params);
+};
diff --git a/ThreeL/MathCommon.h b/ThreeL/MathCommon.h
index c124248..c14e1e2 100644
--- a/ThreeL/MathCommon.h
+++ b/ThreeL/MathCommon.h
@@ -34,4 +34,34 @@ namespace Math
     {
         return (numerator + denominator - 1) / denominator;
     }
+
+    inline uint8_t Log2(uint64_t x) // Returns the base-2 logarithm of x rounded up to the next integer (0 when x is 0 or 1)
+    {
+        DWORD mostSignificantBit;
+        DWORD leastSignificantBit;
+
+        if (_BitScanReverse64(&mostSignificantBit, x) && _BitScanForward64(&leastSignificantBit, x)) // Both scans report failure when x has no bits set
+        {
+            uint8_t result = (uint8_t)mostSignificantBit;
+
+            // If x is not a perfect power of two (IE: multiple bits set) we round up to the next power of two
+            if (mostSignificantBit != leastSignificantBit)
+            { result++; }
+
+            return result;
+        }
+
+        return 0; // x was 0, which has no logarithm -- 0 is returned as a fallback
+    }
+
+    template<typename T> // Rounds x up to the nearest power of two -- 0 stays 0 and exact powers of two are returned unchanged
+    inline T AlignPowerOfTwo(T x)
+    {
+        return x == 0 ? static_cast<T>(0) : static_cast<T>(static_cast<T>(1) << Log2(x)); // Shift a T-typed 1 (not an int literal) so results outside the range of int aren't undefined behavior
+    }
+
+    inline bool IsPowerOfTwo(uint32_t x) // True when exactly one bit of x is set (0 is not considered a power of two)
+    {
+        return x && !(x & (x - 1u)); // x & (x - 1) clears the lowest set bit, so the result is 0 only when no other bits were set
+    }
 }
diff --git a/ThreeL/ParticleSystem.cpp b/ThreeL/ParticleSystem.cpp
index 7bdfcc0..1ca329d 100644
--- a/ThreeL/ParticleSystem.cpp
+++ b/ThreeL/ParticleSystem.cpp
@@ -73,6 +73,36 @@ ParticleSystem::ParticleSystem(ResourceManager& resources, const std::wstring& d
     spriteBuffer->SetName(std::format(L"'{}' Particle Sprites", debugName).c_str());
     m_ParticleSpriteBuffer = RawGpuResource(std::move(spriteBuffer));
 
+    // Allocate sort buffer
+    uint32_t sortBufferSizeBytes = sizeof(uint2) * m_Capacity;
+    D3D12_RESOURCE_DESC sortBufferDescription = DescribeBufferResource(sortBufferSizeBytes, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
+    ComPtr<ID3D12Resource> sortBuffer;
+    AssertSuccess(m_Graphics.Device()->CreateCommittedResource
+    (
+        &heapProperties,
+        D3D12_HEAP_FLAG_CREATE_NOT_ZEROED,
+        &sortBufferDescription,
+        D3D12_RESOURCE_STATE_COMMON,
+        nullptr,
+        IID_PPV_ARGS(&sortBuffer)
+    ));
+    sortBuffer->SetName(std::format(L"'{}' Particle Sort Buffer", debugName).c_str());
+
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDescription =
+    {
+        .Format = DXGI_FORMAT_R32_TYPELESS,
+        .ViewDimension = D3D12_UAV_DIMENSION_BUFFER,
+        .Buffer =
+        {
+            .FirstElement = 0,
+            .NumElements = sortBufferSizeBytes / sizeof(uint32_t),
+            .Flags = D3D12_BUFFER_UAV_FLAG_RAW,
+        },
+    };
+
+    m_ParticleSpriteSortBufferUav = m_Graphics.ResourceDescriptorManager().CreateUnorderedAccessView(sortBuffer.Get(), nullptr, uavDescription);
+    m_ParticleSpriteSortBuffer = RawGpuResource(std::move(sortBuffer));
+
     // Allocate DrawIndirect arguments buffer
     D3D12_RESOURCE_DESC drawIndirectArgumentsDescription = DescribeBufferResource(sizeof(D3D12_DRAW_ARGUMENTS), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
     ComPtr<ID3D12Resource> drawIndirectArguments;
@@ -135,6 +165,7 @@ void ParticleSystem::Update(ComputeContext& context, float deltaTime, D3D12_GPU_
     context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpParticleSpritesOut, m_ParticleSpriteBuffer.GpuAddress());
     context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpLivingParticleCountOut, outputStateBuffer.Counter.GpuAddress());
     context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpDrawIndirectArguments, m_DrawIndirectArguments.GpuAddress());
+    context->SetComputeRootUnorderedAccessView(ShaderInterop::ParticleSystem::RpParticleSpriteSortBuffer, m_ParticleSpriteSortBuffer.GpuAddress());
 
     // Update existing particles
     //PERF: For systems with very large capacities that are not near capacity, this ends up spawning a bunch of useless threads
@@ -166,11 +197,22 @@ void ParticleSystem::Update(ComputeContext& context, float deltaTime, D3D12_GPU_
     context.Dispatch(1);
 
     // Sort particle sprites
-    //TODO
+    BitonicSortParams sortParams =
+    {
+        .SortList = m_ParticleSpriteSortBuffer,
+        .SortListUav = m_ParticleSpriteSortBufferUav,
+        .Capacity = m_Capacity,
+        .ItemKind = BitonicSortParams::SeparateKeyIndex,
+        .ItemCountBuffer = outputStateBuffer.Counter,
+        .SkipPreSort = false,
+        .SortAscending = false,
+    };
+    m_Resources.BitonicSort.Sort(context, sortParams);
 
     // Transition all resources for their use in render
     context.UavBarrier();
     context.TransitionResource(m_ParticleSpriteBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    context.TransitionResource(m_ParticleSpriteSortBuffer, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
     context.TransitionResource(m_DrawIndirectArguments, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT);
 
     // Update complete, save a sync point for render
@@ -191,6 +233,7 @@ void ParticleSystem::Render(GraphicsContext& context, D3D12_GPU_VIRTUAL_ADDRESS
     context->SetGraphicsRootSignature(m_Resources.ParticleRenderRootSignature);
 
     context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpParticleBuffer, m_ParticleSpriteBuffer.GpuAddress());
+    context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpSortedParticleLookupBuffer, m_ParticleSpriteSortBuffer.GpuAddress());
     context->SetGraphicsRootConstantBufferView(ShaderInterop::ParticleRender::RpPerFrameCb, perFrameCb);
     context->SetGraphicsRootShaderResourceView(ShaderInterop::ParticleRender::RpMaterialHeap, m_Resources.PbrMaterials.BufferGpuAddress());
 
diff --git a/ThreeL/ParticleSystem.h b/ThreeL/ParticleSystem.h
index cfd513a..4d1bece 100644
--- a/ThreeL/ParticleSystem.h
+++ b/ThreeL/ParticleSystem.h
@@ -39,6 +39,9 @@ class ParticleSystem
 
     RawGpuResource m_ParticleSpriteBuffer;
 
+    RawGpuResource m_ParticleSpriteSortBuffer;
+    ResourceDescriptor m_ParticleSpriteSortBufferUav;
+
     RawGpuResource m_DrawIndirectArguments;
 
 public:
diff --git a/ThreeL/ResourceManager.cpp b/ThreeL/ResourceManager.cpp
index 7ca94e5..61421cc 100644
--- a/ThreeL/ResourceManager.cpp
+++ b/ThreeL/ResourceManager.cpp
@@ -1,6 +1,7 @@
 #include "pch.h"
 #include "ResourceManager.h"
 
+#include "GraphicsCore.h"
 #include "HlslCompiler.h"
 
 ResourceManager::ResourceManager(GraphicsCore& graphics)
@@ -8,6 +9,8 @@ ResourceManager::ResourceManager(GraphicsCore& graphics)
 {
     HlslCompiler hlslCompiler;
 
+    BitonicSort = ::BitonicSort(Graphics, hlslCompiler);
+
     // Compile all shaders
     ShaderBlobs pbrVs = hlslCompiler.CompileShader(L"Shaders/Pbr.hlsl", L"VsMain", L"vs_6_0");
     ShaderBlobs pbrPs = hlslCompiler.CompileShader(L"Shaders/Pbr.hlsl", L"PsMain", L"ps_6_0");
diff --git a/ThreeL/ResourceManager.h b/ThreeL/ResourceManager.h
index a7a30fb..e715734 100644
--- a/ThreeL/ResourceManager.h
+++ b/ThreeL/ResourceManager.h
@@ -1,10 +1,12 @@
 #pragma once
-#include "GraphicsCore.h"
+#include "BitonicSort.h"
 #include "MeshHeap.h"
 #include "PbrMaterialHeap.h"
 #include "PipelineStateObject.h"
 #include "RootSignature.h"
 
+class GraphicsCore;
+
 namespace MeshInputSlot
 {
     enum MeshInputSlot : UINT
@@ -20,6 +22,7 @@ struct ResourceManager
     GraphicsCore& Graphics;
     PbrMaterialHeap PbrMaterials;
     MeshHeap MeshHeap;
+    BitonicSort BitonicSort;
 
     // No complicated PSO management here, we don't need very many so we just make them all by hand
     RootSignature PbrRootSignature;
diff --git a/ThreeL/ShaderInterop.h b/ThreeL/ShaderInterop.h
index 08a771a..61b1884 100644
--- a/ThreeL/ShaderInterop.h
+++ b/ThreeL/ShaderInterop.h
@@ -104,6 +104,7 @@ namespace ShaderInterop
         enum RootParameters
         {
             RpParticleBuffer,
+            RpSortedParticleLookupBuffer,
             RpPerFrameCb,
             RpMaterialHeap,
             RpLightHeap,
@@ -177,6 +178,7 @@ namespace ShaderInterop
             RpParticleSpritesOut,
             RpLivingParticleCountOut,
             RpDrawIndirectArguments,
+            RpParticleSpriteSortBuffer,
         };
 
         static const uint32_t SpawnGroupSize = 64;
diff --git a/ThreeL/Shaders/ParticleRender.hlsl b/ThreeL/Shaders/ParticleRender.hlsl
index 14e7a7e..78b9f57 100644
--- a/ThreeL/Shaders/ParticleRender.hlsl
+++ b/ThreeL/Shaders/ParticleRender.hlsl
@@ -1,5 +1,6 @@
 #define PBR_ROOT_SIGNATURE \
-    "SRV(t0, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)," \
+    "SRV(t0, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE, visibility = SHADER_VISIBILITY_VERTEX)," \
+    "SRV(t1, space = 900, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE, visibility = SHADER_VISIBILITY_VERTEX)," \
     "CBV(b1)," \
     "SRV(t0, flags = DATA_STATIC)," \
     "SRV(t1, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)," \
@@ -21,6 +22,7 @@
 #include "Pbr.hlsl"
 
 StructuredBuffer<ParticleSprite> g_Particles : register(t0, space900);
+ByteAddressBuffer g_SortedParticleLookup : register(t1, space900);
 
 //===================================================================================================================================================
 // Vertex shader
@@ -30,6 +32,7 @@ StructuredBuffer<ParticleSprite> g_Particles : register(t0, space900);
 PsInput VsMainParticle(uint vertexId : SV_VertexID, uint particleIndex : SV_InstanceID)
 {
     PsInput result;
+    particleIndex = g_SortedParticleLookup.Load(particleIndex * 8);
     ParticleSprite particle = g_Particles[particleIndex];
 
     result.Uv0 = float2(uint2(vertexId >> 1, vertexId) & 1.xx);
diff --git a/ThreeL/Shaders/ParticleSystem.cs.hlsl b/ThreeL/Shaders/ParticleSystem.cs.hlsl
index 3bcca6f..1935623 100644
--- a/ThreeL/Shaders/ParticleSystem.cs.hlsl
+++ b/ThreeL/Shaders/ParticleSystem.cs.hlsl
@@ -10,6 +10,7 @@ RWStructuredBuffer<ParticleSprite> g_ParticleSpritesOut : register(u1, space900)
 
 RWByteAddressBuffer g_LivingParticleCountOut : register(u2, space900);
 RWByteAddressBuffer g_DrawIndirectArguments : register(u3, space900);
+RWByteAddressBuffer g_ParticleSpriteSortBuffer : register(u4, space900);
 
 struct ParticleSystemParams
 {
@@ -52,6 +53,7 @@ ConstantBuffer<ParticleSystemParams> g_Params : register(b0, space900);
     "UAV(u1, space = 900, flags = DATA_VOLATILE)," \
     "UAV(u2, space = 900, flags = DATA_VOLATILE)," \
     "UAV(u3, space = 900, flags = DATA_VOLATILE)," \
+    "UAV(u4, space = 900, flags = DATA_VOLATILE)," \
     ""
 
 void OutputParticle(uint outputIndex, ParticleState state);
@@ -135,9 +137,13 @@ void OutputParticle(uint outputIndex, ParticleState state)
     g_ParticleStatesOut[outputIndex] = state;
 
     // Create a sprite for this particle
-    //TODO: Emit a sort key
     //TODO: Culling?
     g_ParticleSpritesOut[outputIndex] = MakeSprite(state);
+
+    // Emit sort index/key pair
+    float3 toEye = state.WorldPosition - g_PerFrame.EyePosition;
+    float distanceSquared = dot(toEye, toEye);
+    g_ParticleSpriteSortBuffer.Store2(outputIndex * 8, uint2(outputIndex, asuint(distanceSquared)));
 }
 
 [numthreads(1, 1, 1)]
diff --git a/ThreeL/ThreeL.props b/ThreeL/ThreeL.props
index b6fb6af..0b317c2 100644
--- a/ThreeL/ThreeL.props
+++ b/ThreeL/ThreeL.props
@@ -42,6 +42,7 @@
   -->
   <ItemGroup>
     <_Assets Include="$(ExternalDir)Assets/**" TargetPath="Assets/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" />
+    <_Assets Include="$(ExternalDir)BitonicSort/**" TargetPath="Shaders/BitonicSort/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" />
     <_Assets Include="$(MSBuildThisFileDirectory)Assets/**" TargetPath="Assets/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" />
     <_Assets Include="$(MSBuildThisFileDirectory)Shaders/**" TargetPath="Shaders/%(RecursiveDir)/%(Filename)%(Extension)" CopyToOutputDirectory="PreserveNewest" />
   </ItemGroup>
diff --git a/ThreeL/ThreeL.vcxproj b/ThreeL/ThreeL.vcxproj
index f9c5303..7a8177d 100644
--- a/ThreeL/ThreeL.vcxproj
+++ b/ThreeL/ThreeL.vcxproj
@@ -86,6 +86,7 @@
     <ClCompile Include="..\external\xxhash.c" />
     <ClCompile Include="Assert.cpp" />
     <ClCompile Include="AssetLoading.cpp" />
+    <ClCompile Include="BitonicSort.cpp" />
     <ClCompile Include="CameraController.cpp" />
     <ClCompile Include="CameraInput.cpp" />
     <ClCompile Include="CommandContext.cpp" />
@@ -158,6 +159,7 @@
     <ClInclude Include="Assert.h" />
     <ClInclude Include="AssetLoading.h" />
     <ClInclude Include="BackBuffer.h" />
+    <ClInclude Include="BitonicSort.h" />
     <ClInclude Include="CameraController.h" />
     <ClInclude Include="CameraInput.h" />
     <ClInclude Include="CommandContext.h" />
@@ -219,12 +221,17 @@
     <ClInclude Include="Window.h" />
   </ItemGroup>
   <ItemGroup>
+    <None Include="..\external\BitonicSort\BitonicSortCommon.hlsli" />
     <None Include="packages.config" />
     <None Include="Shaders\Common.hlsli" />
     <None Include="Shaders\ParticleCommon.hlsli" />
     <None Include="Shaders\Random.hlsli" />
   </ItemGroup>
   <ItemGroup>
+    <FxCompile Include="..\external\BitonicSort\BitonicInnerSort.cs.hlsl" />
+    <FxCompile Include="..\external\BitonicSort\BitonicOuterSort.cs.hlsl" />
+    <FxCompile Include="..\external\BitonicSort\BitonicPrepareIndirectArgs.cs.hlsl" />
+    <FxCompile Include="..\external\BitonicSort\BitonicPreSort.cs.hlsl" />
     <FxCompile Include="Shaders\DepthDownsample.ps.hlsl" />
     <FxCompile Include="Shaders\DepthOnly.hlsl" />
     <FxCompile Include="Shaders\FullScreenQuad.vs.hlsl" />
diff --git a/ThreeL/ThreeL.vcxproj.filters b/ThreeL/ThreeL.vcxproj.filters
index 97ffbde..8e7fae2 100644
--- a/ThreeL/ThreeL.vcxproj.filters
+++ b/ThreeL/ThreeL.vcxproj.filters
@@ -90,6 +90,7 @@
     <ClCompile Include="Stopwatch.cpp" />
     <ClCompile Include="ParticleSystemDefinition.cpp" />
     <ClCompile Include="AssetLoading.cpp" />
+    <ClCompile Include="BitonicSort.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="Window.h" />
@@ -206,6 +207,7 @@
     <ClInclude Include="Stopwatch.h" />
     <ClInclude Include="ParticleSystemDefinition.h" />
     <ClInclude Include="AssetLoading.h" />
+    <ClInclude Include="BitonicSort.h" />
   </ItemGroup>
   <ItemGroup>
     <None Include="packages.config" />
@@ -218,6 +220,9 @@
     <None Include="Shaders\Random.hlsli">
       <Filter>Shaders</Filter>
     </None>
+    <None Include="..\external\BitonicSort\BitonicSortCommon.hlsli">
+      <Filter>external\BitonicSort</Filter>
+    </None>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="external">
@@ -232,6 +237,9 @@
     <Filter Include="Math">
       <UniqueIdentifier>{2680a101-7374-4cae-9099-cdea269e6f42}</UniqueIdentifier>
     </Filter>
+    <Filter Include="external\BitonicSort">
+      <UniqueIdentifier>{2406106f-1795-4877-909f-77751fc56038}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <FxCompile Include="Shaders\Pbr.hlsl">
@@ -267,6 +275,18 @@
     <FxCompile Include="Shaders\LightSprites.hlsl">
       <Filter>Shaders</Filter>
     </FxCompile>
+    <FxCompile Include="..\external\BitonicSort\BitonicInnerSort.cs.hlsl">
+      <Filter>external\BitonicSort</Filter>
+    </FxCompile>
+    <FxCompile Include="..\external\BitonicSort\BitonicOuterSort.cs.hlsl">
+      <Filter>external\BitonicSort</Filter>
+    </FxCompile>
+    <FxCompile Include="..\external\BitonicSort\BitonicPreSort.cs.hlsl">
+      <Filter>external\BitonicSort</Filter>
+    </FxCompile>
+    <FxCompile Include="..\external\BitonicSort\BitonicPrepareIndirectArgs.cs.hlsl">
+      <Filter>external\BitonicSort</Filter>
+    </FxCompile>
   </ItemGroup>
   <ItemGroup>
     <Manifest Include="ThreeL.manifest" />
diff --git a/ThreeL/UavCounter.cpp b/ThreeL/UavCounter.cpp
index 6fa7fcc..e514d09 100644
--- a/ThreeL/UavCounter.cpp
+++ b/ThreeL/UavCounter.cpp
@@ -30,7 +30,21 @@ UavCounter::UavCounter(GraphicsCore& graphics, const std::wstring& debugName)
             .Flags = D3D12_BUFFER_UAV_FLAG_RAW,
         },
     };
-    m_Uav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(m_Resource.Get(), nullptr, uavDescription);;
+    m_Uav = graphics.ResourceDescriptorManager().CreateUnorderedAccessView(m_Resource.Get(), nullptr, uavDescription);
+
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDescription =
+    {
+        .Format = DXGI_FORMAT_R32_TYPELESS,
+        .ViewDimension = D3D12_SRV_DIMENSION_BUFFER,
+        .Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING,
+        .Buffer =
+        {
+            .FirstElement = 0,
+            .NumElements = 1,
+            .Flags = D3D12_BUFFER_SRV_FLAG_RAW,
+        },
+    };
+    m_Srv = graphics.ResourceDescriptorManager().CreateShaderResourceView(m_Resource.Get(), srvDescription);
 
     m_GpuAddress = m_Resource->GetGPUVirtualAddress();
 }
diff --git a/ThreeL/UavCounter.h b/ThreeL/UavCounter.h
index a7d7c19..0e7cba9 100644
--- a/ThreeL/UavCounter.h
+++ b/ThreeL/UavCounter.h
@@ -13,6 +13,7 @@ class UavCounter : public GpuResource
 {
 private:
     ResourceDescriptor m_Uav;
+    ResourceDescriptor m_Srv;
     D3D12_GPU_VIRTUAL_ADDRESS m_GpuAddress = 0;
 
 public:
@@ -24,5 +25,6 @@ class UavCounter : public GpuResource
     inline operator ID3D12Resource* () const { return Resource(); }
 
     inline ResourceDescriptor Uav() const { return m_Uav; };
+    inline ResourceDescriptor Srv() const { return m_Srv; };
     inline D3D12_GPU_VIRTUAL_ADDRESS GpuAddress() const { return m_GpuAddress; };
 };
diff --git a/external/BitonicSort/BitonicInnerSort.cs.hlsl b/external/BitonicSort/BitonicInnerSort.cs.hlsl
new file mode 100644
index 0000000..3d1b33c
--- /dev/null
+++ b/external/BitonicSort/BitonicInnerSort.cs.hlsl
@@ -0,0 +1,114 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author:  James Stanard 
+//
+// Description:  The bitonic sort works by sorting groups of size k,
+// starting with k=2 and doubling until k>=NumItems.  To sort the
+// group, keys are compared with a distance of j, which starts at half
+// of k and continues halving down to 1.  When j is 1024 and less, the
+// compare and swap can happen in LDS, and these iterations form the
+// "inner sort".  Inner sorting happens in LDS and loops.  Outer sorting
+// happens in memory and does not loop.  (Looping happens on the CPU by
+// issuing sequential dispatches and barriers.)
+
+
+#include "BitonicSortCommon.hlsli"
+
+RWByteAddressBuffer g_SortBuffer : register(u0);
+
+cbuffer Constants : register(b0)
+{
+    uint k; // k >= 4096
+};
+
+#ifdef BITONICSORT_64BIT
+
+groupshared uint gs_SortKeys[2048];
+groupshared uint gs_SortIndices[2048];
+
+void LoadKeyIndexPair( uint Element, uint ListCount )
+{
+    uint2 KeyIndex = Element < ListCount ? g_SortBuffer.Load2(Element * 8) : NullItem; // elements past the end of the list are padded with NullItem
+    gs_SortIndices[Element & 2047] = KeyIndex.x; // first dword -> index; "& 2047" maps the element to its slot in the 2048-entry array
+    gs_SortKeys[Element & 2047] = KeyIndex.y; // second dword -> sort key
+}
+
+void StoreKeyIndexPair( uint Element, uint ListCount )
+{
+    if (Element < ListCount) // only elements inside the list are written back
+        g_SortBuffer.Store2(Element * 8, uint2(gs_SortIndices[Element & 2047], gs_SortKeys[Element & 2047])); // (index, key): same dword order as the load
+}
+
+#else // 32-bit packed key/index pairs
+
+groupshared uint gs_SortKeys[2048];
+
+void LoadKeyIndexPair( uint Element, uint ListCount )
+{
+    gs_SortKeys[Element & 2047] = Element < ListCount ? g_SortBuffer.Load(Element * 4) : NullItem; // one 32-bit item per element; past the end of the list, pad with NullItem
+}
+
+void StoreKeyIndexPair( uint Element, uint ListCount )
+{
+    if (Element < ListCount) // only elements inside the list are written back
+        g_SortBuffer.Store(Element * 4, gs_SortKeys[Element & 2047]); // "& 2047" selects the element's slot in the 2048-entry array
+}
+
+#endif
+
+[RootSignature(BitonicSort_RootSig)]
+[numthreads(1024, 1, 1)]
+void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    const uint ListCount = g_CounterBuffer.Load(CounterOffset); // item count, read from the counter buffer at CounterOffset
+
+    // Item index of the start of this group (each 1024-thread group covers 2048 items)
+    const uint GroupStart = Gid.x * 2048;
+
+    // Load from memory into LDS to prepare sort (two items per thread, 1024 apart)
+    LoadKeyIndexPair(GroupStart + GI, ListCount);
+    LoadKeyIndexPair(GroupStart + GI + 1024, ListCount);
+
+    GroupMemoryBarrierWithGroupSync(); // wait until every thread in the group has filled its slots
+
+    // This is better unrolled because it reduces ALU and because some
+    // architectures can load/store two LDS items in a single instruction
+    // as long as their separation is a compile-time constant.
+    [unroll]
+    for (uint j = 1024; j > 0; j /= 2) // j = compare distance, halving from 1024 down to 1
+    {
+        uint Index2 = InsertOneBit(GI, j); // slot index with bit j set
+        uint Index1 = Index2 ^ j; // same index with bit j cleared: the partner j slots below
+
+        uint A = gs_SortKeys[Index1];
+        uint B = gs_SortKeys[Index2];
+
+        if (ShouldSwap(A, B))
+        {
+            // Swap the keys
+            gs_SortKeys[Index1] = B;
+            gs_SortKeys[Index2] = A;
+
+#ifdef BITONICSORT_64BIT
+            // Then swap the indices (for 64-bit sorts)
+            A = gs_SortIndices[Index1];
+            B = gs_SortIndices[Index2];
+            gs_SortIndices[Index1] = B;
+            gs_SortIndices[Index2] = A;
+#endif
+        }
+
+        GroupMemoryBarrierWithGroupSync(); // finish this pass before the next one reads the arrays
+    }
+
+    StoreKeyIndexPair(GroupStart + GI, ListCount); // write this thread's two slots back to memory
+    StoreKeyIndexPair(GroupStart + GI + 1024, ListCount);
+}
diff --git a/external/BitonicSort/BitonicOuterSort.cs.hlsl b/external/BitonicSort/BitonicOuterSort.cs.hlsl
new file mode 100644
index 0000000..88db6e2
--- /dev/null
+++ b/external/BitonicSort/BitonicOuterSort.cs.hlsl
@@ -0,0 +1,55 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author:  James Stanard 
+//
+
+#include "BitonicSortCommon.hlsli"
+
+RWByteAddressBuffer g_SortBuffer : register(u0);
+
+cbuffer Constants : register(b0)
+{
+    uint k;    // k >= 4096
+    uint j;    // j >= 2048 && j < k
+};
+
+#ifdef BITONICSORT_64BIT
+    #define Element uint2
+    #define LoadElement(idx) g_SortBuffer.Load2(idx * 8)
+    #define StoreElement(idx, elem) g_SortBuffer.Store2(idx * 8, elem)
+#else
+    #define Element uint
+    #define LoadElement(idx) g_SortBuffer.Load(idx * 4)
+    #define StoreElement(idx, elem) g_SortBuffer.Store(idx * 4, elem)
+#endif
+
+[RootSignature(BitonicSort_RootSig)]
+[numthreads(1024, 1, 1)]
+void main( uint3 DTid : SV_DispatchThreadID  )
+{
+    const uint ListCount = g_CounterBuffer.Load(CounterOffset); // item count, read from the counter buffer at CounterOffset
+
+    // Form unique index pair from dispatch thread ID
+    uint Index2 = InsertOneBit(DTid.x, j); // index with bit j set
+    uint Index1 = Index2 ^ (k == 2 * j ? k - 1 : j); // j == k/2: flip every bit below k; otherwise: clear bit j only
+
+    if (Index2 >= ListCount) // Index2 is the larger of the two (bit j is set in it, clear in Index1)
+        return;
+
+    Element A = LoadElement(Index1);
+    Element B = LoadElement(Index2);
+
+    if (ShouldSwap(A, B))
+    {
+        StoreElement(Index1, B); // exchange the two elements directly in the sort buffer
+        StoreElement(Index2, A);
+    }
+}
diff --git a/external/BitonicSort/BitonicPreSort.cs.hlsl b/external/BitonicSort/BitonicPreSort.cs.hlsl
new file mode 100644
index 0000000..dbb5eb9
--- /dev/null
+++ b/external/BitonicSort/BitonicPreSort.cs.hlsl
@@ -0,0 +1,128 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author:  James Stanard 
+//
+// Description:  A bitonic sort must eventually sort the power-of-two
+// ceiling of items.  E.g. 391 items -> 512 items.  Because of this
+// "null items" must be used as padding at the end of the list so that
+// they can participate in the sort but remain at the end of the list.
+//
+// The pre-sort does two things.  It appends null items as needed, and
+// it does the initial sort for k values up to 2048.  This is because
+// we can run 1024 threads, each of which can compare and swap two
+// elements without contention.  And because we can always fit 2048
+// keys & indices in LDS with occupancy greater than one.  (A single
+// thread group can use as much as 32KB of LDS.)
+
+
+#include "BitonicSortCommon.hlsli"
+
+RWByteAddressBuffer g_SortBuffer : register(u0);
+
+#ifdef BITONICSORT_64BIT
+
+groupshared uint gs_SortIndices[2048];
+groupshared uint gs_SortKeys[2048];
+
+void FillSortKey( uint Element, uint ListCount )
+{
+    // Unused elements must sort to the end
+    if (Element < ListCount)
+    {
+        uint2 KeyIndexPair = g_SortBuffer.Load2(Element * 8); // 8 bytes per element
+        gs_SortKeys[Element & 2047] = KeyIndexPair.y; // second dword -> sort key
+        gs_SortIndices[Element & 2047] = KeyIndexPair.x; // first dword -> index
+    }
+    else
+    {
+        gs_SortKeys[Element & 2047] = NullItem; // only the key is padded; gs_SortIndices is not written for these slots
+    }
+}
+
+void StoreKeyIndexPair( uint Element, uint ListCount)
+{
+    if (Element < ListCount) // only elements inside the list are written back
+        g_SortBuffer.Store2(Element * 8, uint2(gs_SortIndices[Element & 2047], gs_SortKeys[Element & 2047])); // (index, key): same dword order as FillSortKey reads
+}
+
+#else // 32-bit packed key/index pairs
+
+groupshared uint gs_SortKeys[2048];
+
+void FillSortKey( uint Element, uint ListCount )
+{
+    // Unused elements must sort to the end, so slots past ListCount are filled with NullItem
+    gs_SortKeys[Element & 2047] = (Element < ListCount ? g_SortBuffer.Load(Element * 4) : NullItem);
+}
+
+void StoreKeyIndexPair( uint Element, uint ListCount )
+{
+    if (Element < ListCount) // only elements inside the list are written back
+        g_SortBuffer.Store(Element * 4, gs_SortKeys[Element & 2047]); // "& 2047" selects the element's slot in the 2048-entry array
+}
+
+#endif
+
+[RootSignature(BitonicSort_RootSig)]
+[numthreads(1024, 1, 1)]
+void main( uint3 Gid : SV_GroupID, uint GI : SV_GroupIndex )
+{
+    // Item index of the start of this group (each 1024-thread group covers 2048 items)
+    const uint GroupStart = Gid.x * 2048;
+
+    // Actual number of items that need sorting
+    const uint ListCount = g_CounterBuffer.Load(CounterOffset);
+
+    FillSortKey(GroupStart + GI, ListCount); // two items per thread, 1024 apart
+    FillSortKey(GroupStart + GI + 1024, ListCount);
+
+    GroupMemoryBarrierWithGroupSync(); // wait until every thread in the group has filled its slots
+
+    uint k;
+
+    // This is better unrolled because it reduces ALU and because some
+    // architectures can load/store two LDS items in a single instruction
+    // as long as their separation is a compile-time constant.
+    [unroll]
+    for (k = 2; k <= 2048; k <<= 1) // k doubles from 2 up to 2048
+    {
+        //[unroll]
+        for (uint j = k / 2; j > 0; j /= 2) // j = compare distance, halving from k/2 down to 1
+        {
+            uint Index2 = InsertOneBit(GI, j); // slot index with bit j set
+            uint Index1 = Index2 ^ (k == 2 * j ? k - 1 : j); // j == k/2: flip every bit below k; otherwise: clear bit j only
+
+            uint A = gs_SortKeys[Index1];
+            uint B = gs_SortKeys[Index2];
+
+            if (ShouldSwap(A, B))
+            {
+                // Swap the keys
+                gs_SortKeys[Index1] = B;
+                gs_SortKeys[Index2] = A;
+
+#ifdef BITONICSORT_64BIT
+                // Then swap the indices (for 64-bit sorts)
+                A = gs_SortIndices[Index1];
+                B = gs_SortIndices[Index2];
+                gs_SortIndices[Index1] = B;
+                gs_SortIndices[Index2] = A;
+#endif
+            }
+
+            GroupMemoryBarrierWithGroupSync(); // finish this pass before the next one reads the arrays
+        }
+    }
+
+    // Write sorted results to memory
+    StoreKeyIndexPair(GroupStart + GI, ListCount);
+    StoreKeyIndexPair(GroupStart + GI + 1024, ListCount);
+}
diff --git a/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl b/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl
new file mode 100644
index 0000000..b5504ad
--- /dev/null
+++ b/external/BitonicSort/BitonicPrepareIndirectArgs.cs.hlsl
@@ -0,0 +1,62 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author:  James Stanard 
+//
+
+#include "BitonicSortCommon.hlsli"
+
+RWByteAddressBuffer g_IndirectArgsBuffer : register(u0);
+
+cbuffer Constants : register(b0)
+{
+    uint MaxIterations;
+}
+
+uint NextPow2( uint Val )
+{
+    uint Mask = (1u << firstbithigh(Val)) - 1; // all bits below Val's highest set bit (NOTE(review): Val == 0 is not special-cased)
+    return (Val + Mask) & ~Mask; // round up to a multiple of Mask + 1: a power of two stays as is, anything else becomes the next one
+}
+
+[RootSignature(BitonicSort_RootSig)]
+[numthreads(22, 1, 1)]
+void main( uint GI : SV_GroupIndex )
+{
+    if (GI >= MaxIterations) // each remaining thread writes the dispatch arguments for one value of k
+        return;
+
+    uint ListCount = g_CounterBuffer.Load(CounterOffset);
+    uint k = 2048u << GI; // thread GI handles k = 2048 << GI
+
+    // We need one more iteration every time the number of thread groups doubles
+    if (k > NextPow2((ListCount + 2047) & ~2047))
+        ListCount = 0; // with a zero count, every group count computed below comes out as 0
+
+    uint PrevDispatches = GI * (GI + 1) / 2; // thread i writes i + 1 entries, so threads 0..GI-1 wrote GI * (GI + 1) / 2
+    uint Offset = 12 * PrevDispatches; // 12 bytes per entry (one uint3)
+
+    // Generate outer sort dispatch arguments
+    for (uint j = k / 2; j > 1024; j /= 2)
+    {
+        // All of the groups of size 2j that are full
+        uint CompleteGroups = (ListCount & ~(2 * j - 1)) / 2048;
+
+        // Remaining items must only be sorted if there are more than j of them
+        uint PartialGroups = ((uint)max(int(ListCount - CompleteGroups * 2048 - j), 0) + 1023) / 1024;
+
+        g_IndirectArgsBuffer.Store3(Offset, uint3(CompleteGroups + PartialGroups, 1, 1));
+
+        Offset += 12; // advance to the next entry
+    }
+
+    // The inner sort always sorts all groups (rounded up to multiples of 2048)
+    g_IndirectArgsBuffer.Store3(Offset, uint3((ListCount + 2047) / 2048, 1, 1));
+}
diff --git a/external/BitonicSort/BitonicSortCommon.hlsli b/external/BitonicSort/BitonicSortCommon.hlsli
new file mode 100644
index 0000000..7dee18e
--- /dev/null
+++ b/external/BitonicSort/BitonicSortCommon.hlsli
@@ -0,0 +1,59 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// This code is licensed under the MIT License (MIT).
+// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
+// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
+//
+// Developed by Minigraph
+//
+// Author:  James Stanard 
+//
+
+#define BitonicSort_RootSig \
+    "RootFlags(0), " \
+    "RootConstants(b0, num32BitConstants = 2)," \
+    "DescriptorTable(SRV(t0, numDescriptors = 1))," \
+    "DescriptorTable(UAV(u0, numDescriptors = 1))," \
+    "RootConstants(b1, num32BitConstants = 2)"
+
+ByteAddressBuffer g_CounterBuffer : register(t0);
+
+cbuffer CB1 : register(b1)
+{
+    // Offset into counter buffer where this list's item count is stored
+    uint CounterOffset;
+
+    // A sort key that will end up at the end of the list; to be used to pad
+    // lists in LDS (always 2048 items).
+    //   Descending:  0x00000000
+    //   Ascending:   0xffffffff
+    // Also used by the ShouldSwap() function to invert ordering.
+    uint NullItem; 
+}
+
+// Takes Value and widens it by one bit at the location of the bit
+// in the mask.  A one is inserted in the space.  OneBitMask must
+// have one and only one bit set.
+uint InsertOneBit( uint Value, uint OneBitMask )
+{
+    uint Mask = OneBitMask - 1; // all bits below the insertion point
+    return (Value & ~Mask) << 1 | (Value & Mask) | OneBitMask; // upper bits shifted up by one | lower bits kept | inserted bit set
+}
+
+// Determines if two sort keys should be swapped in the list.  NullItem is
+// either 0 or 0xffffffff.  XOR with the NullItem will either invert the bits
+// (effectively a negation) or leave the bits alone.  When the NullItem is
+// 0, we are sorting descending, so when A < B, they should swap.  For an
+// ascending sort, ~A < ~B should swap.
+bool ShouldSwap(uint A, uint B)
+{
+    return (A ^ NullItem) < (B ^ NullItem);
+}
+
+// Same as above, but only compares the upper 32-bit word (the .y component); .x does not affect the result.
+bool ShouldSwap(uint2 A, uint2 B)
+{
+    return (A.y ^ NullItem) < (B.y ^ NullItem);
+}
diff --git a/external/README.md b/external/README.md
new file mode 100644
index 0000000..329acda
--- /dev/null
+++ b/external/README.md
@@ -0,0 +1,29 @@
+ThreeL External Dependencies
+===============================================================================
+
+Most of ThreeL's dependencies are incorporated into this repository directly for the sake of simplicity (a few others come from NuGet).
+
+They're taken from the following versions:
+
+| Dependency | File/Directory | Upstream Version | Modified |
+|------------|----------------|------------------|---------------|
+| Dear ImGui | `DearImGui/` | [`docking a88e5be7f4`](https://github.com/ocornut/imgui/tree/a88e5be7f478233e74c72c72eabb1d5f1cb69bb5) | Yes
+| JSON for Modern C++ | `json.hpp` | [`v3.10.4`](https://github.com/nlohmann/json/tree/fec56a1a16c6e1c1b1f4e116a20e79398282626c) |
+| Minigraph Bitonic Sort | `BitonicSort/` | [`a79e01c4c3`](https://github.com/microsoft/DirectX-Graphics-Samples/tree/a79e01c4c39e6d40f4b078688ff95814d166d34f) | Yes
+| Sponza | `Assets/Sponza/` | [`189f80d7d4`](https://github.com/KhronosGroup/glTF-Sample-Models/tree/189f80d7d44f76d8f9be8e337d4c6cb85ef521a4) |
+| stb_image | `stb_image.h` | [`TinyGLTF fork v2.8.9`](https://github.com/syoyo/tinygltf/tree/350c2968025882bdf823e7892d02328548b46435) |
+| TinyGLTF | `tiny_gltf.h` | [`v2.8.9`](https://github.com/syoyo/tinygltf/tree/350c2968025882bdf823e7892d02328548b46435) |
+| xxHash | `xxhash.c`/`xxhash.h` | [`0656ed7539`](https://github.com/Cyan4973/xxHash/tree/0656ed753994ce3d04a39ca132242e98fddef136) |
+
+See [the third-party notice listing](../THIRD-PARTY-NOTICES.md) for details on licenses governing these dependencies.
+
+## Changes made
+
+### Dear ImGui
+
+* Added names to Direct3D 12 objects created by the backend
+
+### Minigraph Bitonic Sort
+
+* Renamed files
+* Fixed some compiler warnings in `BitonicPrepareIndirectArgs.cs.hlsl`
diff --git a/tooling/Common.targets b/tooling/Common.targets
index 4b0d495..61a682d 100644
--- a/tooling/Common.targets
+++ b/tooling/Common.targets
@@ -21,8 +21,9 @@
     <ClCompile Update="$(MSBuildThisFileDirectory)../external/**/*">
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
     </ClCompile>
-    <!-- Don't use legacy HLSL compiler -->
+    <!-- Don't use MSBuild's HLSL compilation -->
     <FxCompile Update="**" ExcludedFromBuild="true" />
+    <FxCompile Update="$(MSBuildThisFileDirectory)../external/**/*" ExcludedFromBuild="true" />
   </ItemGroup>
 
   <!-- Automatically disable runtime checks when ASAN is enabled -->