From 07cee43a2bb0878b178d850b96981c5f661aa5ac Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 3 Nov 2024 19:50:27 +0200
Subject: [PATCH 01/14] HLE DSP: Implement per-voice mixing stage

---
 include/audio/hle_core.hpp  | 14 +++++++++++---
 src/core/audio/hle_core.cpp | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index bd7172379..d05e9808e 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -47,14 +47,17 @@ namespace Audio {
 
 		// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
 		using SampleBuffer = std::deque<std::array<s16, 2>>;
-
 		using BufferQueue = std::priority_queue<Buffer>;
+
 		BufferQueue buffers;
 
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
 		SourceType sourceType = SourceType::Stereo;
 
-		std::array<float, 3> gain0, gain1, gain2;
+		// There's one gain configuration for each of the 3 intermediate mixing stages
+		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
+		std::array<std::array<float, 4>, 3> gains;
+
 		u32 samplePosition;  // Sample number into the current audio buffer
 		float rateMultiplier;
 		u16 syncCount;
@@ -112,6 +115,10 @@ namespace Audio {
 		template <typename T>
 		using QuadFrame = Frame<T, 4>;
 
+		// Internally the DSP uses four channels when mixing.
+		// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
+		using IntermediateMix = QuadFrame<s32>;
+
 	  private:
 		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
 		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
@@ -151,7 +158,8 @@ namespace Audio {
 
 		using Source = Audio::DSPSource;
 		using SampleBuffer = Source::SampleBuffer;
-
+		using IntermediateMix = DSPMixer::IntermediateMix;
+		
 	  private:
 		enum class DSPState : u32 {
 			Off,
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 85eee97a5..9870adef1 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -228,6 +228,7 @@ namespace Audio {
 		// The DSP checks the DSP configuration dirty bits on every frame, applies them, and clears them
 		read.dspConfiguration.dirtyRaw = 0;
 		read.dspConfiguration.dirtyRaw2 = 0;
+		std::array<IntermediateMix, 3> mixes{};
 
 		for (int i = 0; i < sourceCount; i++) {
 			// Update source configuration from the read region of shared memory
@@ -250,6 +251,24 @@ namespace Audio {
 			status.samplePosition = source.samplePosition;
 
 			source.isBufferIDDirty = false;
+
+			// If the source is still enabled, mix its output into the intermediate mix buffers
+			if (source.enabled) {
+				for (int mix = 0; mix < mixes.size(); mix++) {
+					IntermediateMix& intermediateMix = mixes[mix];
+					const std::array<float, 4>& gains = source.gains[mix];
+
+					// TODO: SIMD implementations
+					for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+						// Mono samples are in the format: (l, r)
+						// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
+						intermediateMix[sampleIndex][0] += s32(source.currentSamples[sampleIndex][0] * gains[0]);
+						intermediateMix[sampleIndex][1] += s32(source.currentSamples[sampleIndex][1] * gains[1]);
+						intermediateMix[sampleIndex][2] += s32(source.currentSamples[sampleIndex][0] * gains[2]);
+						intermediateMix[sampleIndex][3] += s32(source.currentSamples[sampleIndex][1] * gains[3]);
+					}
+				}
+			}
 		}
 
 		performMix(read, write);
@@ -374,6 +393,21 @@ namespace Audio {
 			}
 		}
 
+#define CONFIG_GAIN(index)                 \
+	if (config.gain##index##Dirty) {       \
+		auto& dest = source.gains[index];  \
+		auto& source = config.gain[index]; \
+                                           \
+		dest[0] = float(source[0]);        \
+		dest[1] = float(source[1]);        \
+		dest[2] = float(source[2]);        \
+		dest[3] = float(source[3]);        \
+	}
+		CONFIG_GAIN(0);
+		CONFIG_GAIN(1);
+		CONFIG_GAIN(2);
+#undef CONFIG_GAIN
+
 		config.dirtyRaw = 0;
 	}
 

From b299609a9b938ad29694d4346f2e6f5d1290bddb Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 6 Nov 2024 19:26:57 +0200
Subject: [PATCH 02/14] More HLE DSP work

---
 include/audio/hle_core.hpp  | 85 +++++++++++++++++++------------------
 src/core/audio/hle_core.cpp | 23 +++++++---
 2 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index d05e9808e..2fccd55da 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -16,6 +16,46 @@ namespace Audio {
 	using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
 	using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;
 
+	class DSPMixer {
+	  public:
+		template <typename T, usize channelCount = 1>
+		using Sample = std::array<T, channelCount>;
+
+		template <typename T, usize channelCount>
+		using Frame = std::array<Sample<T, channelCount>, 160>;
+
+		template <typename T>
+		using MonoFrame = Frame<T, 1>;
+
+		template <typename T>
+		using StereoFrame = Frame<T, 2>;
+
+		template <typename T>
+		using QuadFrame = Frame<T, 4>;
+
+		// Internally the DSP uses four channels when mixing.
+		// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
+		using IntermediateMix = QuadFrame<s32>;
+
+	  private:
+		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
+		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
+		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
+		static constexpr usize mixerStageCount = 3;
+
+	  public:
+		ChannelFormat channelFormat = ChannelFormat::Stereo;
+		std::array<float, mixerStageCount> volumes;
+		std::array<bool, 2> enableAuxStages;
+
+		void reset() {
+			channelFormat = ChannelFormat::Stereo;
+
+			volumes.fill(0.0);
+			enableAuxStages.fill(false);
+		}
+	};
+
 	struct DSPSource {
 		// Audio buffer information
 		// https://www.3dbrew.org/wiki/DSP_Memory_Region
@@ -49,6 +89,7 @@ namespace Audio {
 		using SampleBuffer = std::deque<std::array<s16, 2>>;
 		using BufferQueue = std::priority_queue<Buffer>;
 
+		DSPMixer::StereoFrame<s16> currentFrame;
 		BufferQueue buffers;
 
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
@@ -98,46 +139,6 @@ namespace Audio {
 		DSPSource() { reset(); }
 	};
 
-	class DSPMixer {
-	  public:
-		template <typename T, usize channelCount = 1>
-		using Sample = std::array<T, channelCount>;
-
-		template <typename T, usize channelCount>
-		using Frame = std::array<Sample<T, channelCount>, 160>;
-
-		template <typename T>
-		using MonoFrame = Frame<T, 1>;
-
-		template <typename T>
-		using StereoFrame = Frame<T, 2>;
-
-		template <typename T>
-		using QuadFrame = Frame<T, 4>;
-
-		// Internally the DSP uses four channels when mixing.
-		// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
-		using IntermediateMix = QuadFrame<s32>;
-
-	  private:
-		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
-		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
-		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
-		static constexpr usize mixerStageCount = 3;
-
-	  public:
-		ChannelFormat channelFormat = ChannelFormat::Stereo;
-		std::array<float, mixerStageCount> volumes;
-		std::array<bool, 2> enableAuxStages;
-
-		void reset() {
-			channelFormat = ChannelFormat::Stereo;
-
-			volumes.fill(0.0);
-			enableAuxStages.fill(false);
-		}
-	};
-
 	class HLE_DSP : public DSPCore {
 		// The audio frame types are public in case we want to use them for unit tests
 	  public:
@@ -159,7 +160,7 @@ namespace Audio {
 		using Source = Audio::DSPSource;
 		using SampleBuffer = Source::SampleBuffer;
 		using IntermediateMix = DSPMixer::IntermediateMix;
-		
+
 	  private:
 		enum class DSPState : u32 {
 			Off,
@@ -226,7 +227,7 @@ namespace Audio {
 		void outputFrame();
 		// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
 		void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);
-		
+
 		// Decode an entire buffer worth of audio
 		void decodeBuffer(DSPSource& source);
 
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 9870adef1..4bc548dcc 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -262,10 +262,10 @@ namespace Audio {
 					for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
 						// Mono samples are in the format: (l, r)
 						// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
-						intermediateMix[sampleIndex][0] += s32(source.currentSamples[sampleIndex][0] * gains[0]);
-						intermediateMix[sampleIndex][1] += s32(source.currentSamples[sampleIndex][1] * gains[1]);
-						intermediateMix[sampleIndex][2] += s32(source.currentSamples[sampleIndex][0] * gains[2]);
-						intermediateMix[sampleIndex][3] += s32(source.currentSamples[sampleIndex][1] * gains[3]);
+						intermediateMix[sampleIndex][0] += s32(source.currentFrame[sampleIndex][0] * gains[0]);
+						intermediateMix[sampleIndex][1] += s32(source.currentFrame[sampleIndex][1] * gains[1]);
+						intermediateMix[sampleIndex][2] += s32(source.currentFrame[sampleIndex][0] * gains[2]);
+						intermediateMix[sampleIndex][3] += s32(source.currentFrame[sampleIndex][1] * gains[3]);
 					}
 				}
 			}
@@ -467,6 +467,9 @@ namespace Audio {
 	}
 
 	void HLE_DSP::generateFrame(DSPSource& source) {
+		// Zero out all output samples at first. TODO: Don't zero out the entire frame initially, rather only zero-out the "unwritten" samples when the frame is done being processed.
+		source.currentFrame = {};
+
 		if (source.currentSamples.empty()) {
 			// There's no audio left to play, turn the voice off
 			if (source.buffers.empty()) {
@@ -480,7 +483,7 @@ namespace Audio {
 
 			decodeBuffer(source);
 		} else {
-			uint maxSampleCount = uint(float(Audio::samplesInFrame) * source.rateMultiplier);
+			uint maxSampleCount = uint(float(Audio::samplesInFrame) * 1.0);
 			uint outputCount = 0;
 
 			while (outputCount < maxSampleCount) {
@@ -494,8 +497,14 @@ namespace Audio {
 
 				const uint sampleCount = std::min<s32>(maxSampleCount - outputCount, source.currentSamples.size());
 
-				// samples.insert(samples.end(), source.currentSamples.begin(), source.currentSamples.begin() + sampleCount);
+				// Copy samples to current frame buffer
+				// TODO: Implement linear/polyphase interpolation
+				std::copy(
+					source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount), source.currentFrame.begin() + outputCount
+				);
+				// Remove samples from sample buffer
 				source.currentSamples.erase(source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount));
+				// Advance sample position
 				source.samplePosition += sampleCount;
 				outputCount += sampleCount;
 			}
@@ -718,5 +727,7 @@ namespace Audio {
 
 		buffers = {};
 		currentSamples.clear();
+
+		gains.fill({});
 	}
 }  // namespace Audio

From 8cfffb8119f4e78715d929aedbe304df235daaef Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 7 Nov 2024 22:08:28 +0200
Subject: [PATCH 03/14] HLE DSP: Actually parse InterpolationMode config

---
 include/audio/hle_core.hpp  | 2 ++
 src/core/audio/hle_core.cpp | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index 2fccd55da..29fd45426 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -88,12 +88,14 @@ namespace Audio {
 		// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
 		using SampleBuffer = std::deque<std::array<s16, 2>>;
 		using BufferQueue = std::priority_queue<Buffer>;
+		using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
 
 		DSPMixer::StereoFrame<s16> currentFrame;
 		BufferQueue buffers;
 
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
 		SourceType sourceType = SourceType::Stereo;
+		InterpolationMode interpolationMode = InterpolationMode::Linear;
 
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 4bc548dcc..06e001f11 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -319,6 +319,10 @@ namespace Audio {
 			source.sourceType = config.monoOrStereo;
 		}
 
+		if (config.interpolationDirty) {
+			source.interpolationMode = config.interpolationMode;
+		}
+
 		if (config.rateMultiplierDirty) {
 			source.rateMultiplier = (config.rateMultiplier > 0.f) ? config.rateMultiplier : 1.f;
 		}
@@ -499,6 +503,7 @@ namespace Audio {
 
 				// Copy samples to current frame buffer
 				// TODO: Implement linear/polyphase interpolation
+
 				std::copy(
 					source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount), source.currentFrame.begin() + outputCount
 				);
@@ -718,6 +723,7 @@ namespace Audio {
 		// Initialize these to some sane defaults
 		sampleFormat = SampleFormat::ADPCM;
 		sourceType = SourceType::Stereo;
+		interpolationMode = InterpolationMode::Linear;
 
 		samplePosition = 0;
 		previousBufferID = 0;

From 69e8e1c2c46f848c5b5a58c871c812184d61123f Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 9 Nov 2024 23:11:19 +0200
Subject: [PATCH 04/14] Add audio interpolation helpers

---
 CMakeLists.txt                         |  2 +
 include/audio/audio_interpolation.hpp  | 58 ++++++++++++++++++++
 include/audio/hle_core.hpp             | 47 ++---------------
 include/audio/hle_mixer.hpp            | 50 ++++++++++++++++++
 src/core/audio/audio_interpolation.cpp | 73 ++++++++++++++++++++++++++
 src/core/audio/hle_core.cpp            |  1 +
 6 files changed, 188 insertions(+), 43 deletions(-)
 create mode 100644 include/audio/audio_interpolation.hpp
 create mode 100644 include/audio/hle_mixer.hpp
 create mode 100644 src/core/audio/audio_interpolation.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74fafc04c..3193701d8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -294,6 +294,7 @@ set(APPLET_SOURCE_FILES src/core/applets/applet.cpp src/core/applets/mii_selecto
 )
 set(AUDIO_SOURCE_FILES src/core/audio/dsp_core.cpp src/core/audio/null_core.cpp src/core/audio/teakra_core.cpp
                        src/core/audio/miniaudio_device.cpp src/core/audio/hle_core.cpp src/core/audio/aac_decoder.cpp
+                       src/core/audio/audio_interpolation.cpp
 )
 set(RENDERER_SW_SOURCE_FILES src/core/renderer_sw/renderer_sw.cpp)
 
@@ -334,6 +335,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
                  include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
                  include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
+                 include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp
 )
 
 cmrc_add_resource_library(
diff --git a/include/audio/audio_interpolation.hpp b/include/audio/audio_interpolation.hpp
new file mode 100644
index 000000000..8a87cbcd2
--- /dev/null
+++ b/include/audio/audio_interpolation.hpp
@@ -0,0 +1,58 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <deque>
+
+#include "audio/hle_mixer.hpp"
+#include "helpers.hpp"
+
+namespace Audio::Interpolation {
+	// A variable length buffer of signed PCM16 stereo samples.
+	using StereoBuffer16 = std::deque<std::array<s16, 2>>;
+	using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;
+
+	struct State {
+		// Two historical samples.
+		std::array<s16, 2> xn1 = {};  //< x[n-1]
+		std::array<s16, 2> xn2 = {};  //< x[n-2]
+		// Current fractional position.
+		u64 fposition = 0;
+	};
+
+	/**
+	 * No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay.
+	 * @param state Interpolation state.
+	 * @param input Input buffer.
+	 * @param rate Stretch factor. Must be a positive non-zero value.
+	 *             rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
+	 * @param output The resampled audio buffer.
+	 * @param outputi The index of output to start writing to.
+	 */
+	void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
+
+	/**
+	 * Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay.
+	 * @param state Interpolation state.
+	 * @param input Input buffer.
+	 * @param rate Stretch factor. Must be a positive non-zero value.
+	 *             rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
+	 * @param output The resampled audio buffer.
+	 * @param outputi The index of output to start writing to.
+	 */
+	void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
+
+	/**
+	 * Polyphase interpolation. This is currently stubbed to just perform linear interpolation
+	 * @param state Interpolation state.
+	 * @param input Input buffer.
+	 * @param rate Stretch factor. Must be a positive non-zero value.
+	 *             rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
+	 * @param output The resampled audio buffer.
+	 * @param outputi The index of output to start writing to.
+	 */
+	void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
+}  // namespace Audio::Interpolation
\ No newline at end of file
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index 29fd45426..b3832d76b 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -8,54 +8,13 @@
 
 #include "audio/aac.hpp"
 #include "audio/aac_decoder.hpp"
+#include "audio/audio_interpolation.hpp"
 #include "audio/dsp_core.hpp"
 #include "audio/dsp_shared_mem.hpp"
+#include "audio/hle_mixer.hpp"
 #include "memory.hpp"
 
 namespace Audio {
-	using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
-	using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;
-
-	class DSPMixer {
-	  public:
-		template <typename T, usize channelCount = 1>
-		using Sample = std::array<T, channelCount>;
-
-		template <typename T, usize channelCount>
-		using Frame = std::array<Sample<T, channelCount>, 160>;
-
-		template <typename T>
-		using MonoFrame = Frame<T, 1>;
-
-		template <typename T>
-		using StereoFrame = Frame<T, 2>;
-
-		template <typename T>
-		using QuadFrame = Frame<T, 4>;
-
-		// Internally the DSP uses four channels when mixing.
-		// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
-		using IntermediateMix = QuadFrame<s32>;
-
-	  private:
-		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
-		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
-		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
-		static constexpr usize mixerStageCount = 3;
-
-	  public:
-		ChannelFormat channelFormat = ChannelFormat::Stereo;
-		std::array<float, mixerStageCount> volumes;
-		std::array<bool, 2> enableAuxStages;
-
-		void reset() {
-			channelFormat = ChannelFormat::Stereo;
-
-			volumes.fill(0.0);
-			enableAuxStages.fill(false);
-		}
-	};
-
 	struct DSPSource {
 		// Audio buffer information
 		// https://www.3dbrew.org/wiki/DSP_Memory_Region
@@ -89,6 +48,7 @@ namespace Audio {
 		using SampleBuffer = std::deque<std::array<s16, 2>>;
 		using BufferQueue = std::priority_queue<Buffer>;
 		using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
+        using InterpolationState = Audio::Interpolation::State;
 
 		DSPMixer::StereoFrame<s16> currentFrame;
 		BufferQueue buffers;
@@ -96,6 +56,7 @@ namespace Audio {
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
 		SourceType sourceType = SourceType::Stereo;
 		InterpolationMode interpolationMode = InterpolationMode::Linear;
+        InterpolationState interpolationState;
 
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
diff --git a/include/audio/hle_mixer.hpp b/include/audio/hle_mixer.hpp
new file mode 100644
index 000000000..ed8b4a098
--- /dev/null
+++ b/include/audio/hle_mixer.hpp
@@ -0,0 +1,50 @@
+#pragma once
+#include <array>
+
+#include "audio/dsp_shared_mem.hpp"
+#include "helpers.hpp"
+
+namespace Audio {
+	using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
+	using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;
+
+	class DSPMixer {
+	  public:
+		template <typename T, usize channelCount = 1>
+		using Sample = std::array<T, channelCount>;
+
+		template <typename T, usize channelCount>
+		using Frame = std::array<Sample<T, channelCount>, 160>;
+
+		template <typename T>
+		using MonoFrame = Frame<T, 1>;
+
+		template <typename T>
+		using StereoFrame = Frame<T, 2>;
+
+		template <typename T>
+		using QuadFrame = Frame<T, 4>;
+
+		// Internally the DSP uses four channels when mixing.
+		// Neatly, QuadFrame<s32> means that every sample is a uint32x4 value, which is particularly nice for SIMD mixing
+		using IntermediateMix = QuadFrame<s32>;
+
+	  private:
+		using ChannelFormat = HLE::DspConfiguration::OutputFormat;
+		// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
+		// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
+		static constexpr usize mixerStageCount = 3;
+
+	  public:
+		ChannelFormat channelFormat = ChannelFormat::Stereo;
+		std::array<float, mixerStageCount> volumes;
+		std::array<bool, 2> enableAuxStages;
+
+		void reset() {
+			channelFormat = ChannelFormat::Stereo;
+
+			volumes.fill(0.0);
+			enableAuxStages.fill(false);
+		}
+	};
+}  // namespace Audio
\ No newline at end of file
diff --git a/src/core/audio/audio_interpolation.cpp b/src/core/audio/audio_interpolation.cpp
new file mode 100644
index 000000000..d13c786ee
--- /dev/null
+++ b/src/core/audio/audio_interpolation.cpp
@@ -0,0 +1,73 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "audio/audio_interpolation.hpp"
+
+#include <algorithm>
+
+#include "helpers.hpp"
+
+namespace Audio::Interpolation {
+	// Calculations are done in fixed point with 24 fractional bits.
+	// (This is not verified. This was chosen for minimal error.)
+	static constexpr u64 scaleFactor = 1 << 24;
+	static constexpr u64 scaleMask = scaleFactor - 1;
+
+	/// Here we step over the input in steps of rate, until we consume all of the input.
+	/// Three adjacent samples are passed to fn each step.
+	template <typename Function>
+	static void stepOverSamples(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi, Function fn) {
+		if (input.empty()) {
+			return;
+		}
+
+		input.insert(input.begin(), {state.xn2, state.xn1});
+
+		const u64 step_size = static_cast<u64>(rate * scaleFactor);
+		u64 fposition = state.fposition;
+		usize inputi = 0;
+
+		while (outputi < output.size()) {
+			inputi = static_cast<usize>(fposition / scaleFactor);
+
+			if (inputi + 2 >= input.size()) {
+				inputi = input.size() - 2;
+				break;
+			}
+
+			u64 fraction = fposition & scaleMask;
+			output[outputi++] = fn(fraction, input[inputi], input[inputi + 1], input[inputi + 2]);
+
+			fposition += step_size;
+		}
+
+		state.xn2 = input[inputi];
+		state.xn1 = input[inputi + 1];
+		state.fposition = fposition - inputi * scaleFactor;
+
+		input.erase(input.begin(), std::next(input.begin(), inputi + 2));
+	}
+
+	void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
+		stepOverSamples(state, input, rate, output, outputi, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) { return x0; });
+	}
+
+	void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
+		// Note on accuracy: Some values that this produces are +/- 1 from the actual firmware.
+		stepOverSamples(state, input, rate, output, outputi, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) {
+			// This is a saturated subtraction. (Verified by black-box fuzzing.)
+			s64 delta0 = std::clamp<s64>(x1[0] - x0[0], -32768, 32767);
+			s64 delta1 = std::clamp<s64>(x1[1] - x0[1], -32768, 32767);
+
+			return std::array<s16, 2>{
+				static_cast<s16>(x0[0] + fraction * delta0 / scaleFactor),
+				static_cast<s16>(x0[1] + fraction * delta1 / scaleFactor),
+			};
+		});
+	}
+
+	void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
+		linear(state, input, rate, output, outputi);
+	}
+}  // namespace Audio::Interpolation
\ No newline at end of file
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 06e001f11..593dd4777 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -732,6 +732,7 @@ namespace Audio {
 		rateMultiplier = 1.f;
 
 		buffers = {};
+        interpolationState = {};
 		currentSamples.clear();
 
 		gains.fill({});

From c70388dbeb1b1939caae67d6401d4caa61ea4bd0 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 9 Nov 2024 23:18:41 +0200
Subject: [PATCH 05/14] HLE DSP: Actually interpolate audio

---
 src/core/audio/hle_core.cpp | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 593dd4777..96e51aaed 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -488,7 +488,7 @@ namespace Audio {
 			decodeBuffer(source);
 		} else {
 			uint maxSampleCount = uint(float(Audio::samplesInFrame) * 1.0);
-			uint outputCount = 0;
+			usize outputCount = 0;
 
 			while (outputCount < maxSampleCount) {
 				if (source.currentSamples.empty()) {
@@ -499,19 +499,27 @@ namespace Audio {
 					}
 				}
 
-				const uint sampleCount = std::min<s32>(maxSampleCount - outputCount, source.currentSamples.size());
+				switch (source.interpolationMode) {
+					case Source::InterpolationMode::Linear:
+						Audio::Interpolation::linear(
+							source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
+						);
+						break;
+					case Source::InterpolationMode::None:
+						Audio::Interpolation::none(
+							source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
+						);
+						break;
 
-				// Copy samples to current frame buffer
-				// TODO: Implement linear/polyphase interpolation
+					case Source::InterpolationMode::Polyphase:
+						// Currently stubbed to be the same as linear
+						Audio::Interpolation::polyphase(
+							source.interpolationState, source.currentSamples, source.rateMultiplier, source.currentFrame, outputCount
+						);
+						break;
+				}
 
-				std::copy(
-					source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount), source.currentFrame.begin() + outputCount
-				);
-				// Remove samples from sample buffer
-				source.currentSamples.erase(source.currentSamples.begin(), std::next(source.currentSamples.begin(), sampleCount));
-				// Advance sample position
-				source.samplePosition += sampleCount;
-				outputCount += sampleCount;
+				source.samplePosition += u32(outputCount);
 			}
 		}
 	}

From 6a793097226eb4ad0c2ad856c0cfe343b1adf7c2 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 9 Nov 2024 23:52:29 +0200
Subject: [PATCH 06/14] HLE DSP: Fix up resampling a bit

---
 src/core/audio/hle_core.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 96e51aaed..77ed719f7 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -487,10 +487,10 @@ namespace Audio {
 
 			decodeBuffer(source);
 		} else {
-			uint maxSampleCount = uint(float(Audio::samplesInFrame) * 1.0);
 			usize outputCount = 0;
+			static constexpr usize maxSamples = Audio::samplesInFrame;
 
-			while (outputCount < maxSampleCount) {
+			while (outputCount < maxSamples) {
 				if (source.currentSamples.empty()) {
 					if (source.buffers.empty()) {
 						break;
@@ -518,9 +518,9 @@ namespace Audio {
 						);
 						break;
 				}
-
-				source.samplePosition += u32(outputCount);
 			}
+
+			source.samplePosition += u32(outputCount * source.rateMultiplier);
 		}
 	}
 

From 7a4f3f48369009c9960c1d39d42a442e10b8e53d Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 10 Nov 2024 14:53:07 +0200
Subject: [PATCH 07/14] HLE DSP: Add passthrough mix detection

---
 include/audio/hle_core.hpp  |  4 ++++
 src/core/audio/hle_core.cpp | 31 ++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index b3832d76b..eed7bcc14 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -61,6 +61,10 @@ namespace Audio {
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
 		std::array<std::array<float, 4>, 3> gains;
+		// Of the 3 intermediate mix stages, typically only the first one is actually enabled and the other ones do nothing
+		// Ie their gain is vec4(0.0). We track which stages are disabled (have a gain of all 0s) using this bitfield and skip them
+		// In order to save up on CPU time.
+		uint enabledMixStages = 0;
 
 		u32 samplePosition;  // Sample number into the current audio buffer
 		float rateMultiplier;
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 77ed719f7..be754c81a 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -255,6 +255,11 @@ namespace Audio {
 			// If the source is still enabled, mix its output into the intermediate mix buffers
 			if (source.enabled) {
 				for (int mix = 0; mix < mixes.size(); mix++) {
+					// Check if this stage is passthrough, and if it is, then skip it
+					if ((source.enabledMixStages & (1u << mix)) == 0) {
+						continue;
+					}
+
 					IntermediateMix& intermediateMix = mixes[mix];
 					const std::array<float, 4>& gains = source.gains[mix];
 
@@ -397,16 +402,23 @@ namespace Audio {
 			}
 		}
 
-#define CONFIG_GAIN(index)                 \
-	if (config.gain##index##Dirty) {       \
-		auto& dest = source.gains[index];  \
-		auto& source = config.gain[index]; \
-                                           \
-		dest[0] = float(source[0]);        \
-		dest[1] = float(source[1]);        \
-		dest[2] = float(source[2]);        \
-		dest[3] = float(source[3]);        \
+#define CONFIG_GAIN(index)                                                          \
+	if (config.gain##index##Dirty) {                                                \
+		auto& dest = source.gains[index];                                           \
+		auto& sourceGain = config.gain[index];                                      \
+                                                                                    \
+		dest[0] = float(sourceGain[0]);                                             \
+		dest[1] = float(sourceGain[1]);                                             \
+		dest[2] = float(sourceGain[2]);                                             \
+		dest[3] = float(sourceGain[3]);                                             \
+                                                                                    \
+		if (dest[0] == 0.f && dest[1] == 0.f && dest[2] == 0.f && dest[3] == 0.f) { \
+			source.enabledMixStages &= ~(1u << index);                              \
+		} else {                                                                    \
+			source.enabledMixStages |= (1u << index);                               \
+		}                                                                           \
 	}
+
 		CONFIG_GAIN(0);
 		CONFIG_GAIN(1);
 		CONFIG_GAIN(2);
@@ -744,5 +756,6 @@ namespace Audio {
 		currentSamples.clear();
 
 		gains.fill({});
+		enabledMixStages = 0;
 	}
 }  // namespace Audio

From e22bc580600a9a2013ca64ba0d764f52fa5e5607 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 10 Nov 2024 14:55:31 +0200
Subject: [PATCH 08/14] HLE DSP: Format

---
 src/core/audio/hle_core.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index be754c81a..1e7b6bfb7 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -483,7 +483,8 @@ namespace Audio {
 	}
 
 	void HLE_DSP::generateFrame(DSPSource& source) {
-		// Zero out all output samples at first. TODO: Don't zero out the entire frame initially, rather only zero-out the "unwritten" samples when the frame is done being processed.
+		// Zero out all output samples at first. TODO: Don't zero out the entire frame initially, rather only zero-out the "unwritten" samples when
+		// the frame is done being processed.
 		source.currentFrame = {};
 
 		if (source.currentSamples.empty()) {
@@ -556,7 +557,7 @@ namespace Audio {
 		if (config.outputFormatDirty) {
 			mixer.channelFormat = config.outputFormat;
 		}
-		
+
 		if (config.masterVolumeDirty) {
 			mixer.volumes[0] = config.masterVolume;
 		}
@@ -564,7 +565,7 @@ namespace Audio {
 		if (config.auxVolume0Dirty) {
 			mixer.volumes[1] = config.auxVolumes[0];
 		}
-		
+
 		if (config.auxVolume1Dirty) {
 			mixer.volumes[2] = config.auxVolumes[1];
 		}
@@ -726,7 +727,7 @@ namespace Audio {
 				response = request;
 				response.resultCode = AAC::ResultCode::Success;
 				break;
-				
+
 			default: Helpers::warn("Unknown AAC command type"); break;
 		}
 
@@ -752,7 +753,7 @@ namespace Audio {
 		rateMultiplier = 1.f;
 
 		buffers = {};
-        interpolationState = {};
+		interpolationState = {};
 		currentSamples.clear();
 
 		gains.fill({});

From 66be960150418f6758d4ea86f67eb037eece4f9f Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 19 Nov 2024 01:27:42 +0200
Subject: [PATCH 09/14] HLE DSP: Stub audio output

---
 src/core/audio/hle_core.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 1e7b6bfb7..3ab05c93d 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -211,11 +211,11 @@ namespace Audio {
 
 		if (audioEnabled) {
 			// Wait until we've actually got room to push our frame
-			while (sampleBuffer.size() + 2 > sampleBuffer.Capacity()) {
+			while (sampleBuffer.size() + frame.size() * 2 > sampleBuffer.Capacity()) {
 				std::this_thread::sleep_for(std::chrono::milliseconds{1});
 			}
 
-			sampleBuffer.push(frame.data(), frame.size());
+			sampleBuffer.push(frame.data(), frame.size() * 2);
 		}
 	}
 
@@ -276,6 +276,12 @@ namespace Audio {
 			}
 		}
 
+		for (int i = 0; i < Audio::samplesInFrame; i++) {
+			auto& mix0 = mixes[0];
+			auto& sample = mix0[i];
+			frame[i] = {s16(sample[0]), s16(sample[2])};
+		}
+
 		performMix(read, write);
 	}
 

From 47ffd76faefc46f22add87bb93bce9f69f249399 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 19 Nov 2024 01:36:07 +0200
Subject: [PATCH 10/14] HLE DSP: Fix temporary quad -> stereo conversion

---
 src/core/audio/hle_core.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index 3ab05c93d..fec795f89 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -279,7 +279,7 @@ namespace Audio {
 		for (int i = 0; i < Audio::samplesInFrame; i++) {
 			auto& mix0 = mixes[0];
 			auto& sample = mix0[i];
-			frame[i] = {s16(sample[0]), s16(sample[2])};
+			frame[i] = {s16(sample[0]), s16(sample[1])};
 		}
 
 		performMix(read, write);

From efb6cdd30d462be35dcf98b8a8d2c3976ba772f3 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 19 Nov 2024 02:12:20 +0200
Subject: [PATCH 11/14] Run clang-format

---
 include/audio/hle_core.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index eed7bcc14..5868a5d00 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -48,7 +48,7 @@ namespace Audio {
 		using SampleBuffer = std::deque<std::array<s16, 2>>;
 		using BufferQueue = std::priority_queue<Buffer>;
 		using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
-        using InterpolationState = Audio::Interpolation::State;
+		using InterpolationState = Audio::Interpolation::State;
 
 		DSPMixer::StereoFrame<s16> currentFrame;
 		BufferQueue buffers;
@@ -56,7 +56,7 @@ namespace Audio {
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
 		SourceType sourceType = SourceType::Stereo;
 		InterpolationMode interpolationMode = InterpolationMode::Linear;
-        InterpolationState interpolationState;
+		InterpolationState interpolationState;
 
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
@@ -221,5 +221,4 @@ namespace Audio {
 		void setSemaphore(u16 value) override {}
 		void setSemaphoreMask(u16 value) override {}
 	};
-
 }  // namespace Audio

From 878ff419fd87752c7f51f8ce8ae76e893edee437 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 20 Nov 2024 17:52:42 +0200
Subject: [PATCH 12/14] DSP: Add SSE quad-conversion code

Co-Authored-By: Kelpsy <138107494+kelpsyberry@users.noreply.github.com>
---
 CMakeLists.txt              |  2 +-
 include/audio/dsp_simd.hpp  | 62 +++++++++++++++++++++++++++++++++++++
 include/audio/hle_core.hpp  |  7 +++--
 src/core/audio/hle_core.cpp | 15 +++------
 4 files changed, 73 insertions(+), 13 deletions(-)
 create mode 100644 include/audio/dsp_simd.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f780387d2..c5e76d5a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,7 +355,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
                  include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
                  include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
-                 include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp
+                 include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp include/audio/dsp_simd.hpp
 )
 
 cmrc_add_resource_library(
diff --git a/include/audio/dsp_simd.hpp b/include/audio/dsp_simd.hpp
new file mode 100644
index 000000000..488234850
--- /dev/null
+++ b/include/audio/dsp_simd.hpp
@@ -0,0 +1,62 @@
+#pragma once
+
+#include "audio/hle_mixer.hpp"
+#include "compiler_builtins.hpp"
+#include "helpers.hpp"
+
+#if defined(_M_AMD64) || defined(__x86_64__)
+#define DSP_SIMD_X64
+#include <immintrin.h>
+#elif defined(_M_ARM64) || defined(__aarch64__)
+#define DSP_SIMD_ARM64
+#include <arm_neon.h>
+#endif
+
+// Optimized SIMD functions for mixing the stereo output of a DSP voice into a quadraphonic intermediate mix
+namespace DSP::MixIntoQuad {
+	using IntermediateMix = Audio::DSPMixer::IntermediateMix;
+	using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;
+
+	// Non-SIMD, portable algorithm
+	ALWAYS_INLINE static void mixPortable(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+			// Mono samples are in the format: (l, r)
+			// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
+			mix[sampleIndex][0] += s32(frame[sampleIndex][0] * gains[0]);
+			mix[sampleIndex][1] += s32(frame[sampleIndex][1] * gains[1]);
+			mix[sampleIndex][2] += s32(frame[sampleIndex][0] * gains[2]);
+			mix[sampleIndex][3] += s32(frame[sampleIndex][1] * gains[3]);
+		}
+	}
+
+#if defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+	ALWAYS_INLINE static void mixSSE4_1(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+			// The stereo samples, repeated every 4 bytes inside the vector register
+			__m128i stereoSamples = _mm_castps_si128(_mm_load1_ps((float*)&frame[sampleIndex][0]));
+
+			__m128 currentFrame = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(stereoSamples));
+			__m128 gains_ = _mm_load_ps(gains);
+			__m128i offset = _mm_cvtps_epi32(_mm_mul_ps(currentFrame, gains_));
+			__m128i intermediateMixPrev = _mm_load_si128((__m128i*)&mix[sampleIndex][0]);
+			__m128i result = _mm_add_epi32(intermediateMixPrev, offset);
+			_mm_store_si128((__m128i*)&mix[sampleIndex][0], result);
+		}
+	}
+#endif
+
+#ifdef DSP_SIMD_ARM64
+	ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) { mixPortable(mix, frame, gains); }
+#endif
+
+	// Mixes the stereo output of a DSP voice into a quadraphonic intermediate mix
+	static void mix(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+#if defined(DSP_SIMD_ARM64)
+		return mixNEON(mix, frame, gains);
+#elif defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
+		return mixSSE4_1(mix, frame, gains);
+#else
+		return mixPortable(mix, frame, gains);
+#endif
+	}
+}  // namespace DSP::MixIntoQuad
\ No newline at end of file
diff --git a/include/audio/hle_core.hpp b/include/audio/hle_core.hpp
index 5868a5d00..32bbaae8a 100644
--- a/include/audio/hle_core.hpp
+++ b/include/audio/hle_core.hpp
@@ -50,7 +50,9 @@ namespace Audio {
 		using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
 		using InterpolationState = Audio::Interpolation::State;
 
-		DSPMixer::StereoFrame<s16> currentFrame;
+		// The samples this voice output for this audio frame.
+		// Aligned to 4 for SIMD purposes.
+		alignas(4) DSPMixer::StereoFrame<s16> currentFrame;
 		BufferQueue buffers;
 
 		SampleFormat sampleFormat = SampleFormat::ADPCM;
@@ -60,7 +62,8 @@ namespace Audio {
 
 		// There's one gain configuration for each of the 3 intermediate mixing stages
 		// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
-		std::array<std::array<float, 4>, 3> gains;
+		// Aligned to 16 for SIMD purposes
+		alignas(16) std::array<std::array<float, 4>, 3> gains;
 		// Of the 3 intermediate mix stages, typically only the first one is actually enabled and the other ones do nothing
 		// Ie their gain is vec4(0.0). We track which stages are disabled (have a gain of all 0s) using this bitfield and skip them
 		// In order to save up on CPU time.
diff --git a/src/core/audio/hle_core.cpp b/src/core/audio/hle_core.cpp
index fec795f89..7e82a1398 100644
--- a/src/core/audio/hle_core.cpp
+++ b/src/core/audio/hle_core.cpp
@@ -7,6 +7,7 @@
 #include <utility>
 
 #include "audio/aac_decoder.hpp"
+#include "audio/dsp_simd.hpp"
 #include "services/dsp.hpp"
 
 namespace Audio {
@@ -228,7 +229,9 @@ namespace Audio {
 		// The DSP checks the DSP configuration dirty bits on every frame, applies them, and clears them
 		read.dspConfiguration.dirtyRaw = 0;
 		read.dspConfiguration.dirtyRaw2 = 0;
-		std::array<IntermediateMix, 3> mixes{};
+
+		// The intermediate mix buffer is aligned to 16 for SIMD purposes
+		alignas(16) std::array<IntermediateMix, 3> mixes{};
 
 		for (int i = 0; i < sourceCount; i++) {
 			// Update source configuration from the read region of shared memory
@@ -263,15 +266,7 @@ namespace Audio {
 					IntermediateMix& intermediateMix = mixes[mix];
 					const std::array<float, 4>& gains = source.gains[mix];
 
-					// TODO: SIMD implementations
-					for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
-						// Mono samples are in the format: (l, r)
-						// When converting to quad, gain0 and gain2 are applied to the left sample, gain1 and gain3 to the right one
-						intermediateMix[sampleIndex][0] += s32(source.currentFrame[sampleIndex][0] * gains[0]);
-						intermediateMix[sampleIndex][1] += s32(source.currentFrame[sampleIndex][1] * gains[1]);
-						intermediateMix[sampleIndex][2] += s32(source.currentFrame[sampleIndex][0] * gains[2]);
-						intermediateMix[sampleIndex][3] += s32(source.currentFrame[sampleIndex][1] * gains[3]);
-					}
+					DSP::MixIntoQuad::mix(intermediateMix, source.currentFrame, gains.data());
 				}
 			}
 		}

From f30eed79801d790ad9186e075b4820243a3ec0b0 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 20 Nov 2024 20:01:00 +0200
Subject: [PATCH 13/14] DSP: Add NEON quad-conversion code

Co-Authored-By: Kelpsy <138107494+kelpsyberry@users.noreply.github.com>
---
 include/audio/dsp_simd.hpp | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/include/audio/dsp_simd.hpp b/include/audio/dsp_simd.hpp
index 488234850..92cb486fc 100644
--- a/include/audio/dsp_simd.hpp
+++ b/include/audio/dsp_simd.hpp
@@ -31,13 +31,14 @@ namespace DSP::MixIntoQuad {
 
 #if defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
 	ALWAYS_INLINE static void mixSSE4_1(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		__m128 gains_ = _mm_load_ps(gains);
+
 		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
 			// The stereo samples, repeated every 4 bytes inside the vector register
 			__m128i stereoSamples = _mm_castps_si128(_mm_load1_ps((float*)&frame[sampleIndex][0]));
 
 			__m128 currentFrame = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(stereoSamples));
-			__m128 gains_ = _mm_load_ps(gains);
-			__m128i offset = _mm_cvtps_epi32(_mm_mul_ps(currentFrame, gains_));
+			__m128i offset = _mm_cvttps_epi32(_mm_mul_ps(currentFrame, gains_));
 			__m128i intermediateMixPrev = _mm_load_si128((__m128i*)&mix[sampleIndex][0]);
 			__m128i result = _mm_add_epi32(intermediateMixPrev, offset);
 			_mm_store_si128((__m128i*)&mix[sampleIndex][0], result);
@@ -46,7 +47,22 @@ namespace DSP::MixIntoQuad {
 #endif
 
 #ifdef DSP_SIMD_ARM64
-	ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) { mixPortable(mix, frame, gains); }
+	ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
+		float32x4_t gains_ = vld1q_f32(gains);
+
+		for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
+			// Load l and r samples and repeat them every 4 bytes
+			int32x4_t stereoSamples = vld1q_dup_s32((s32*)&frame[sampleIndex][0]);
+			// Expand the bottom 4 s16 samples into an int32x4 with sign extension, then convert them to float32x4
+			float32x4_t currentFrame = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(stereoSamples))));
+
+			// Multiply samples by their respective gains, truncate the result, and add it into the intermediate mix buffer
+			int32x4_t offset = vcvtq_s32_f32(vmulq_f32(currentFrame, gains_));
+			int32x4_t intermediateMixPrev = vld1q_s32((s32*)&mix[sampleIndex][0]);
+			int32x4_t result = vaddq_f32(intermediateMixPrev, offset);
+			vst1q_s32((s32*)&mix[sampleIndex][0], result);
+		}
+	}
 #endif
 
 	// Mixes the stereo output of a DSP voice into a quadraphonic intermediate mix

From b78450c88d4a4df15937c8f6584070257e7ec69c Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 20 Nov 2024 21:17:08 +0200
Subject: [PATCH 14/14] NEON mixer: Change vaddq_f32 to vaddq_s32 (Thank you
 Clang)

Co-Authored-By: Kelpsy <138107494+kelpsyberry@users.noreply.github.com>
---
 include/audio/dsp_simd.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/audio/dsp_simd.hpp b/include/audio/dsp_simd.hpp
index 92cb486fc..9a0e723a4 100644
--- a/include/audio/dsp_simd.hpp
+++ b/include/audio/dsp_simd.hpp
@@ -59,7 +59,7 @@ namespace DSP::MixIntoQuad {
 			// Multiply samples by their respective gains, truncate the result, and add it into the intermediate mix buffer
 			int32x4_t offset = vcvtq_s32_f32(vmulq_f32(currentFrame, gains_));
 			int32x4_t intermediateMixPrev = vld1q_s32((s32*)&mix[sampleIndex][0]);
-			int32x4_t result = vaddq_f32(intermediateMixPrev, offset);
+			int32x4_t result = vaddq_s32(intermediateMixPrev, offset);
 			vst1q_s32((s32*)&mix[sampleIndex][0], result);
 		}
 	}