From b74e566548da036dcd6d48b6d4371ed93a4844c9 Mon Sep 17 00:00:00 2001 From: Ziemas Date: Tue, 17 Oct 2023 04:44:35 +0200 Subject: [PATCH 1/5] SPU2: Simplify reverb resampling --- pcsx2/SPU2/Reverb.cpp | 49 +++++++++++++++++++------------------------ pcsx2/SPU2/defs.h | 6 +++--- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/pcsx2/SPU2/Reverb.cpp b/pcsx2/SPU2/Reverb.cpp index 2f007df04d9fa..5718706711300 100644 --- a/pcsx2/SPU2/Reverb.cpp +++ b/pcsx2/SPU2/Reverb.cpp @@ -99,47 +99,34 @@ static constexpr std::array filter_coefs = { s32 __forceinline V_Core::ReverbDownsample(bool right) { + int index = (RevbSampleBufPos - NUM_TAPS) & 63; s32 out = 0; - // Skipping the 0 coefs. - for (u32 i = 0; i < NUM_TAPS; i += 2) + for (int i = 0; i < NUM_TAPS; i++) { - out += RevbDownBuf[right][((RevbSampleBufPos - NUM_TAPS) + i) & 63] * filter_coefs[i]; + out += RevbDownBuf[right][index + i] * filter_coefs[i]; } - // We also skipped the middle so add that in. - out += RevbDownBuf[right][((RevbSampleBufPos - NUM_TAPS) + 19) & 63] * filter_coefs[19]; - out >>= 15; out = std::clamp(out, INT16_MIN, INT16_MAX); return out; } -StereoOut32 __forceinline V_Core::ReverbUpsample(bool phase) +StereoOut32 __forceinline V_Core::ReverbUpsample() { + int index = (RevbSampleBufPos - NUM_TAPS) & 63; s32 ls = 0, rs = 0; - if (phase) - { - ls += RevbUpBuf[0][(((RevbSampleBufPos - NUM_TAPS) >> 1) + 9) & 63] * filter_coefs[19]; - rs += RevbUpBuf[1][(((RevbSampleBufPos - NUM_TAPS) >> 1) + 9) & 63] * filter_coefs[19]; - } - else + for (int i = 0; i < NUM_TAPS; i++) { - for (u32 i = 0; i < (NUM_TAPS >> 1) + 1; i++) - { - ls += RevbUpBuf[0][(((RevbSampleBufPos - NUM_TAPS) >> 1) + i) & 63] * filter_coefs[i * 2]; - } - for (u32 i = 0; i < (NUM_TAPS >> 1) + 1; i++) - { - rs += RevbUpBuf[1][(((RevbSampleBufPos - NUM_TAPS) >> 1) + i) & 63] * filter_coefs[i * 2]; - } + ls += RevbUpBuf[0][index + i] * (filter_coefs[i] * 2); + rs += RevbUpBuf[1][index + i] * (filter_coefs[i] * 2); } - ls >>= 14; + ls >>= 15; ls = std::clamp(ls, INT16_MIN, INT16_MAX); - rs >>= 14; + rs >>= 15; rs = std::clamp(rs, INT16_MIN, INT16_MAX); return {ls, rs}; @@ -164,8 +151,10 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) return StereoOut32::Empty; } - RevbDownBuf[0][RevbSampleBufPos & 63] = Input.Left; - RevbDownBuf[1][RevbSampleBufPos & 63] = Input.Right; + RevbDownBuf[0][RevbSampleBufPos] = Input.Left; + RevbDownBuf[1][RevbSampleBufPos] = Input.Right; + RevbDownBuf[0][RevbSampleBufPos + 64] = Input.Left; + RevbDownBuf[1][RevbSampleBufPos + 64] = Input.Right; bool R = Cycles & 1; @@ -245,9 +234,13 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) _spu2mem[apf2_dst] = clamp_mix(apf2); } - RevbUpBuf[R][(RevbSampleBufPos >> 1) & 63] = clamp_mix(out); + RevbUpBuf[R][RevbSampleBufPos] = clamp_mix(out); + RevbUpBuf[!R][RevbSampleBufPos] = 0; + + RevbUpBuf[R][RevbSampleBufPos + 64] = clamp_mix(out); + RevbUpBuf[!R][RevbSampleBufPos + 64] = 0; - RevbSampleBufPos++; + RevbSampleBufPos = (RevbSampleBufPos + 1) & 63; - return ReverbUpsample(RevbSampleBufPos & 1); + return ReverbUpsample(); } diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index 0e06a70826fc9..6733ad0ac058e 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -422,8 +422,8 @@ struct V_Core V_Reverb Revb; // Reverb Registers - s32 RevbDownBuf[2][64]; // Downsample buffer for reverb, one for each channel - s32 RevbUpBuf[2][64]; // Upsample buffer for reverb, one for each channel + s32 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel + s32 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel u32 RevbSampleBufPos; u32 EffectsStartA; u32 EffectsEndA; @@ -487,7 +487,7 @@ struct V_Core s32 RevbGetIndexer(s32 offset); s32 ReverbDownsample(bool right); - StereoOut32 ReverbUpsample(bool phase); + StereoOut32 ReverbUpsample(); StereoOut32 ReadInput(); StereoOut32 ReadInput_HiFi(); From b5a305793bf54704f403f4039d71ee0703e0bce3 Mon Sep 17 00:00:00 2001 From: Ziemas Date: Tue, 17 Oct 2023 09:41:25 +0200 Subject: [PATCH 2/5] SPU2: Optimize reverb resampling [SAVEVERSION+] --- pcsx2/GS/GSVector4i.h | 5 ++ pcsx2/GS/GSVector8i.h | 5 ++ pcsx2/SPU2/Reverb.cpp | 172 +++++++++++++++++++++++++++++++++++------- pcsx2/SPU2/defs.h | 4 +- pcsx2/SaveState.h | 2 +- 5 files changed, 156 insertions(+), 32 deletions(-) diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index c34e62365f473..4c9a120852e80 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -846,6 +846,11 @@ class alignas(16) GSVector4i return GSVector4i(_mm_adds_epi16(m, v.m)); } + __forceinline GSVector4i hadds16(const GSVector4i& v) const + { + return GSVector4i(_mm_hadds_epi16(m, v.m)); + } + __forceinline GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); diff --git a/pcsx2/GS/GSVector8i.h b/pcsx2/GS/GSVector8i.h index 8bff2eef42845..2cd2032f1385d 100644 --- a/pcsx2/GS/GSVector8i.h +++ b/pcsx2/GS/GSVector8i.h @@ -765,6 +765,11 @@ class alignas(32) GSVector8i return GSVector8i(_mm256_adds_epi16(m, v.m)); } + __forceinline GSVector8i hadds16(const GSVector8i& v) const + { + return GSVector8i(_mm256_hadds_epi16(m, v.m)); + } + __forceinline GSVector8i addus8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu8(m, v.m)); diff --git a/pcsx2/SPU2/Reverb.cpp b/pcsx2/SPU2/Reverb.cpp index 5718706711300..f1369e9903d7d 100644 --- a/pcsx2/SPU2/Reverb.cpp +++ b/pcsx2/SPU2/Reverb.cpp @@ -15,6 +15,8 @@ #include "PrecompiledHeader.h" #include "Global.h" +#include "GS/GSVector.h" + #include @@ -55,7 +57,7 @@ void V_Core::AnalyzeReverbPreset() static constexpr u32 NUM_TAPS = 39; // 39 tap filter, the 0's could be optimized out -static constexpr std::array filter_coefs = { +static constexpr std::array filter_down_coefs alignas(32) = { -1, 0, 2, @@ -97,39 +99,147 @@ static constexpr std::array filter_coefs = { -1, }; -s32 __forceinline V_Core::ReverbDownsample(bool right) +static constexpr std::array make_up_coefs() { - int index = (RevbSampleBufPos - NUM_TAPS) & 63; - s32 out = 0; + std::array ret = {}; - for (int i = 0; i < NUM_TAPS; i++) + for (u32 i = 0; i < NUM_TAPS; i++) { - out += RevbDownBuf[right][index + i] * filter_coefs[i]; + ret[i] = static_cast(std::clamp(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX)); } - out >>= 15; - out = std::clamp(out, INT16_MIN, INT16_MAX); - - return out; + return ret; } -StereoOut32 __forceinline V_Core::ReverbUpsample() +static constexpr std::array filter_up_coefs alignas(32) = make_up_coefs(); + +s32 __forceinline V_Core::ReverbDownsample(bool right) { int index = (RevbSampleBufPos - NUM_TAPS) & 63; - s32 ls = 0, rs = 0; - for (int i = 0; i < NUM_TAPS; i++) - { - ls += RevbUpBuf[0][index + i] * (filter_coefs[i] * 2); - rs += RevbUpBuf[1][index + i] * (filter_coefs[i] * 2); - } +#if _M_SSE >= 0x501 + auto c = GSVector8i::load(&filter_down_coefs[0]); + auto s = GSVector8i::load(&RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); + + c = GSVector8i::load(&filter_down_coefs[16]); + s = GSVector8i::load(&RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector8i::load(&filter_down_coefs[32]); + s = GSVector8i::load(&RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.adds16(acc.ba()); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); +#else + auto c = GSVector4i::load(&filter_down_coefs[0]); + auto s = GSVector4i::load(&RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); - ls >>= 15; - ls = std::clamp(ls, INT16_MIN, INT16_MAX); - rs >>= 15; - rs = std::clamp(rs, INT16_MIN, INT16_MAX); + c = GSVector4i::load(&filter_down_coefs[8]); + s = GSVector4i::load(&RevbDownBuf[right][index + 8]); + acc = acc.adds16(s.mul16hrs(c)); - return {ls, rs}; + c = GSVector4i::load(&filter_down_coefs[16]); + s = GSVector4i::load(&RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[24]); + s = GSVector4i::load(&RevbDownBuf[right][index + 24]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[32]); + s = GSVector4i::load(&RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); +#endif + + return acc.I16[0]; +} + +StereoOut32 __forceinline V_Core::ReverbUpsample() +{ + int index = (RevbSampleBufPos - NUM_TAPS) & 63; + +#if _M_SSE >= 0x501 + auto c = GSVector8i::load(&filter_up_coefs[0]); + auto l = GSVector8i::load(&RevbUpBuf[0][index]); + auto r = GSVector8i::load(&RevbUpBuf[1][index]); + + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); + + c = GSVector8i::load(&filter_up_coefs[16]); + l = GSVector8i::load(&RevbUpBuf[0][index + 16]); + r = GSVector8i::load(&RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector8i::load(&filter_up_coefs[32]); + l = GSVector8i::load(&RevbUpBuf[0][index + 32]); + r = GSVector8i::load(&RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.adds16(lacc.ba()); + racc = racc.adds16(racc.ba()); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); +#else + auto c = GSVector4i::load(&filter_up_coefs[0]); + auto l = GSVector4i::load(&RevbUpBuf[0][index]); + auto r = GSVector4i::load(&RevbUpBuf[1][index]); + + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); + + c = GSVector4i::load(&filter_up_coefs[8]); + l = GSVector4i::load(&RevbUpBuf[0][index + 8]); + r = GSVector4i::load(&RevbUpBuf[1][index + 8]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[16]); + l = GSVector4i::load(&RevbUpBuf[0][index + 16]); + r = GSVector4i::load(&RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[24]); + l = GSVector4i::load(&RevbUpBuf[0][index + 24]); + r = GSVector4i::load(&RevbUpBuf[1][index + 24]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[32]); + l = GSVector4i::load(&RevbUpBuf[0][index + 32]); + r = GSVector4i::load(&RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); +#endif + + return {lacc.I16[0], racc.I16[0]}; } __forceinline s32 V_Core::RevbGetIndexer(s32 offset) @@ -151,10 +261,12 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) return StereoOut32::Empty; } - RevbDownBuf[0][RevbSampleBufPos] = Input.Left; - RevbDownBuf[1][RevbSampleBufPos] = Input.Right; - RevbDownBuf[0][RevbSampleBufPos + 64] = Input.Left; - RevbDownBuf[1][RevbSampleBufPos + 64] = Input.Right; + auto input = clamp_mix(Input); + + RevbDownBuf[0][RevbSampleBufPos] = input.Left; + RevbDownBuf[1][RevbSampleBufPos] = input.Right; + RevbDownBuf[0][RevbSampleBufPos | 64] = input.Left; + RevbDownBuf[1][RevbSampleBufPos | 64] = input.Right; bool R = Cycles & 1; @@ -234,11 +346,13 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) _spu2mem[apf2_dst] = clamp_mix(apf2); } - RevbUpBuf[R][RevbSampleBufPos] = clamp_mix(out); + out = clamp_mix(out); + + RevbUpBuf[R][RevbSampleBufPos] = out; RevbUpBuf[!R][RevbSampleBufPos] = 0; - RevbUpBuf[R][RevbSampleBufPos + 64] = clamp_mix(out); - RevbUpBuf[!R][RevbSampleBufPos + 64] = 0; + RevbUpBuf[R][RevbSampleBufPos | 64] = out; + RevbUpBuf[!R][RevbSampleBufPos | 64] = 0; RevbSampleBufPos = (RevbSampleBufPos + 1) & 63; diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index 6733ad0ac058e..dfd6845b47b77 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -422,8 +422,8 @@ struct V_Core V_Reverb Revb; // Reverb Registers - s32 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel - s32 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel + s16 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel + s16 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel u32 RevbSampleBufPos; u32 EffectsStartA; u32 EffectsEndA; diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h index bc79047f38067..6b5c804c9e407 100644 --- a/pcsx2/SaveState.h +++ b/pcsx2/SaveState.h @@ -37,7 +37,7 @@ enum class FreezeAction // [SAVEVERSION+] // This informs the auto updater that the users savestates will be invalidated. -static const u32 g_SaveVersion = (0x9A47 << 16) | 0x0000; +static const u32 g_SaveVersion = (0x9A48 << 16) | 0x0000; // the freezing data between submodules and core From 4b237594ebee2c3efe9ca011ded4e85444be2186 Mon Sep 17 00:00:00 2001 From: Ziemas Date: Tue, 17 Oct 2023 16:01:12 +0200 Subject: [PATCH 3/5] SPU2: Multi-isa resampling functions --- pcsx2/CMakeLists.txt | 7 +- pcsx2/SPU2/Reverb.cpp | 192 +------------------------ pcsx2/SPU2/ReverbResample.cpp | 257 ++++++++++++++++++++++++++++++++++ pcsx2/SPU2/defs.h | 13 +- pcsx2/SPU2/spu2sys.cpp | 6 + pcsx2/pcsx2.vcxproj | 3 +- 6 files changed, 283 insertions(+), 195 deletions(-) create mode 100644 pcsx2/SPU2/ReverbResample.cpp diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index b782c094bc6e7..f5d3f93ea5a09 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -280,6 +280,10 @@ set(pcsx2SPU2Sources SPU2/Wavedump_wav.cpp ) +set(pcsx2SPU2SourcesUnshared + SPU2/ReverbResample.cpp +) + # SPU2 headers set(pcsx2SPU2Headers SPU2/Debug.h @@ -762,7 +766,7 @@ if(DISABLE_ADVANCE_SIMD) # Note: ld64 (macOS's linker) does not act the same way when presented with .a files, unless linked with `-force_load` (cmake WHOLE_ARCHIVE). set(is_first_isa "1") foreach(isa "sse4" "avx" "avx2") - add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared} ${pcsx2IPUSourcesUnshared}) + add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared} ${pcsx2IPUSourcesUnshared} ${pcsx2SPU2SourcesUnshared}) target_link_libraries(GS-${isa} PRIVATE PCSX2_FLAGS) target_compile_definitions(GS-${isa} PRIVATE MULTI_ISA_UNSHARED_COMPILATION=isa_${isa} MULTI_ISA_IS_FIRST=${is_first_isa} ${pcsx2_defs_${isa}}) target_compile_options(GS-${isa} PRIVATE ${compile_options_${isa}}) @@ -778,6 +782,7 @@ if(DISABLE_ADVANCE_SIMD) else() list(APPEND pcsx2GSSources ${pcsx2GSSourcesUnshared}) list(APPEND pcsx2IPUSources ${pcsx2IPUSourcesUnshared}) + list(APPEND pcsx2SPU2Sources ${pcsx2SPU2SourcesUnshared}) endif() # DebugTools sources diff --git a/pcsx2/SPU2/Reverb.cpp b/pcsx2/SPU2/Reverb.cpp index f1369e9903d7d..9126d50148da1 100644 --- a/pcsx2/SPU2/Reverb.cpp +++ b/pcsx2/SPU2/Reverb.cpp @@ -19,7 +19,6 @@ #include - void V_Core::AnalyzeReverbPreset() { Console.WriteLn("Reverb Parameter Update for Core %d:", Index); @@ -55,193 +54,6 @@ void V_Core::AnalyzeReverbPreset() Console.WriteLn("----------------------------------------------------------"); } -static constexpr u32 NUM_TAPS = 39; -// 39 tap filter, the 0's could be optimized out -static constexpr std::array filter_down_coefs alignas(32) = { - -1, - 0, - 2, - 0, - -10, - 0, - 35, - 0, - -103, - 0, - 266, - 0, - -616, - 0, - 1332, - 0, - -2960, - 0, - 10246, - 16384, - 10246, - 0, - -2960, - 0, - 1332, - 0, - -616, - 0, - 266, - 0, - -103, - 0, - 35, - 0, - -10, - 0, - 2, - 0, - -1, -}; - -static constexpr std::array make_up_coefs() -{ - std::array ret = {}; - - for (u32 i = 0; i < NUM_TAPS; i++) - { - ret[i] = static_cast(std::clamp(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX)); - } - - return ret; -} - -static constexpr std::array filter_up_coefs alignas(32) = make_up_coefs(); - -s32 __forceinline V_Core::ReverbDownsample(bool right) -{ - int index = (RevbSampleBufPos - NUM_TAPS) & 63; - -#if _M_SSE >= 0x501 - auto c = GSVector8i::load(&filter_down_coefs[0]); - auto s = GSVector8i::load(&RevbDownBuf[right][index]); - auto acc = s.mul16hrs(c); - - c = GSVector8i::load(&filter_down_coefs[16]); - s = GSVector8i::load(&RevbDownBuf[right][index + 16]); - acc = acc.adds16(s.mul16hrs(c)); - - c = GSVector8i::load(&filter_down_coefs[32]); - s = GSVector8i::load(&RevbDownBuf[right][index + 32]); - acc = acc.adds16(s.mul16hrs(c)); - - acc = acc.adds16(acc.ba()); - - acc = acc.hadds16(acc); - acc = acc.hadds16(acc); - acc = acc.hadds16(acc); -#else - auto c = GSVector4i::load(&filter_down_coefs[0]); - auto s = GSVector4i::load(&RevbDownBuf[right][index]); - auto acc = s.mul16hrs(c); - - c = GSVector4i::load(&filter_down_coefs[8]); - s = GSVector4i::load(&RevbDownBuf[right][index + 8]); - acc = acc.adds16(s.mul16hrs(c)); - - c = GSVector4i::load(&filter_down_coefs[16]); - s = GSVector4i::load(&RevbDownBuf[right][index + 16]); - acc = acc.adds16(s.mul16hrs(c)); - - c = GSVector4i::load(&filter_down_coefs[24]); - s = GSVector4i::load(&RevbDownBuf[right][index + 24]); - acc = acc.adds16(s.mul16hrs(c)); - - c = GSVector4i::load(&filter_down_coefs[32]); - s = GSVector4i::load(&RevbDownBuf[right][index + 32]); - acc = acc.adds16(s.mul16hrs(c)); - - acc = acc.hadds16(acc); - acc = acc.hadds16(acc); - acc = acc.hadds16(acc); -#endif - - return acc.I16[0]; -} - -StereoOut32 __forceinline V_Core::ReverbUpsample() -{ - int index = (RevbSampleBufPos - NUM_TAPS) & 63; - -#if _M_SSE >= 0x501 - auto c = GSVector8i::load(&filter_up_coefs[0]); - auto l = GSVector8i::load(&RevbUpBuf[0][index]); - auto r = GSVector8i::load(&RevbUpBuf[1][index]); - - auto lacc = l.mul16hrs(c); - auto racc = r.mul16hrs(c); - - c = GSVector8i::load(&filter_up_coefs[16]); - l = GSVector8i::load(&RevbUpBuf[0][index + 16]); - r = GSVector8i::load(&RevbUpBuf[1][index + 16]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - c = GSVector8i::load(&filter_up_coefs[32]); - l = GSVector8i::load(&RevbUpBuf[0][index + 32]); - r = GSVector8i::load(&RevbUpBuf[1][index + 32]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - lacc = lacc.adds16(lacc.ba()); - racc = racc.adds16(racc.ba()); - - lacc = lacc.hadds16(lacc); - lacc = lacc.hadds16(lacc); - lacc = lacc.hadds16(lacc); - - racc = racc.hadds16(racc); - racc = racc.hadds16(racc); - racc = racc.hadds16(racc); -#else - auto c = GSVector4i::load(&filter_up_coefs[0]); - auto l = GSVector4i::load(&RevbUpBuf[0][index]); - auto r = GSVector4i::load(&RevbUpBuf[1][index]); - - auto lacc = l.mul16hrs(c); - auto racc = r.mul16hrs(c); - - c = GSVector4i::load(&filter_up_coefs[8]); - l = GSVector4i::load(&RevbUpBuf[0][index + 8]); - r = GSVector4i::load(&RevbUpBuf[1][index + 8]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - c = GSVector4i::load(&filter_up_coefs[16]); - l = GSVector4i::load(&RevbUpBuf[0][index + 16]); - r = GSVector4i::load(&RevbUpBuf[1][index + 16]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - c = GSVector4i::load(&filter_up_coefs[24]); - l = GSVector4i::load(&RevbUpBuf[0][index + 24]); - r = GSVector4i::load(&RevbUpBuf[1][index + 24]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - c = GSVector4i::load(&filter_up_coefs[32]); - l = GSVector4i::load(&RevbUpBuf[0][index + 32]); - r = GSVector4i::load(&RevbUpBuf[1][index + 32]); - lacc = lacc.adds16(l.mul16hrs(c)); - racc = racc.adds16(r.mul16hrs(c)); - - lacc = lacc.hadds16(lacc); - lacc = lacc.hadds16(lacc); - lacc = lacc.hadds16(lacc); - - racc = racc.hadds16(racc); - racc = racc.hadds16(racc); - racc = racc.hadds16(racc); -#endif - - return {lacc.I16[0], racc.I16[0]}; -} - __forceinline s32 V_Core::RevbGetIndexer(s32 offset) { u32 start = EffectsStartA & 0x3f'ffff; @@ -325,7 +137,7 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) s32 in, same, diff, apf1, apf2, out; #define MUL(x, y) ((x) * (y) >> 15) - in = MUL(R ? Revb.IN_COEF_R : Revb.IN_COEF_L, ReverbDownsample(R)); + in = MUL(R ? Revb.IN_COEF_R : Revb.IN_COEF_L, ReverbDownsample(*this, R)); same = MUL(Revb.IIR_VOL, in + MUL(Revb.WALL_VOL, _spu2mem[same_src]) - _spu2mem[same_prv]) + _spu2mem[same_prv]; diff = MUL(Revb.IIR_VOL, in + MUL(Revb.WALL_VOL, _spu2mem[diff_src]) - _spu2mem[diff_prv]) + _spu2mem[diff_prv]; @@ -356,5 +168,5 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) RevbSampleBufPos = (RevbSampleBufPos + 1) & 63; - return ReverbUpsample(); + return ReverbUpsample(*this); } diff --git a/pcsx2/SPU2/ReverbResample.cpp b/pcsx2/SPU2/ReverbResample.cpp new file mode 100644 index 0000000000000..72bf93f2c2d63 --- /dev/null +++ b/pcsx2/SPU2/ReverbResample.cpp @@ -0,0 +1,257 @@ +#include "GS/GSVector.h" +#include "Global.h" + +MULTI_ISA_UNSHARED_START + +static constexpr u32 NUM_TAPS = 39; +// 39 tap filter, the 0's could be optimized out +static constexpr std::array filter_down_coefs alignas(32) = { + -1, + 0, + 2, + 0, + -10, + 0, + 35, + 0, + -103, + 0, + 266, + 0, + -616, + 0, + 1332, + 0, + -2960, + 0, + 10246, + 16384, + 10246, + 0, + -2960, + 0, + 1332, + 0, + -616, + 0, + 266, + 0, + -103, + 0, + 35, + 0, + -10, + 0, + 2, + 0, + -1, +}; + +static constexpr std::array make_up_coefs() +{ + std::array ret = {}; + + for (u32 i = 0; i < NUM_TAPS; i++) + { + ret[i] = static_cast(std::clamp(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX)); + } + + return ret; +} + +static constexpr std::array filter_up_coefs alignas(32) = make_up_coefs(); + +s32 __forceinline ReverbDownsample_reference(V_Core& core, bool right) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + s32 out = 0; + + for (int i = 0; i < NUM_TAPS; i++) + { + out += core.RevbDownBuf[right][index + i] * filter_down_coefs[i]; + } + + out >>= 15; + + return clamp_mix(out); +} + +#if _M_SSE >= 0x501 +s32 __forceinline ReverbDownsample_avx(V_Core& core, bool right) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + + auto c = GSVector8i::load(&filter_down_coefs[0]); + auto s = GSVector8i::load(&core.RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); + + c = GSVector8i::load(&filter_down_coefs[16]); + s = GSVector8i::load(&core.RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector8i::load(&filter_down_coefs[32]); + s = GSVector8i::load(&core.RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.adds16(acc.ba()); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + + return acc.I16[0]; +} +#endif + +s32 __forceinline ReverbDownsample_sse(V_Core& core, bool right) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + + auto c = GSVector4i::load(&filter_down_coefs[0]); + auto s = GSVector4i::load(&core.RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); + + c = GSVector4i::load(&filter_down_coefs[8]); + s = GSVector4i::load(&core.RevbDownBuf[right][index + 8]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[16]); + s = GSVector4i::load(&core.RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[24]); + s = GSVector4i::load(&core.RevbDownBuf[right][index + 24]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[32]); + s = GSVector4i::load(&core.RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + + return acc.I16[0]; +} + +s32 ReverbDownsample(V_Core& core, bool right) +{ +#if _M_SSE >= 0x501 + return ReverbDownsample_avx(core, right); +#else + return ReverbDownsample_sse(core, right); +#endif +} + +StereoOut32 __forceinline ReverbUpsample_reference(V_Core& core) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + s32 l = 0, r = 0; + + for (int i = 0; i < NUM_TAPS; i++) + { + l += core.RevbUpBuf[0][index + i] * filter_up_coefs[i]; + r += core.RevbUpBuf[1][index + i] * filter_up_coefs[i]; + } + + l >>= 15; + r >>= 15; + + return {clamp_mix(l), clamp_mix(r)}; +} + +#if _M_SSE >= 0x501 +StereoOut32 __forceinline ReverbUpsample_avx(V_Core& core) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + + auto c = GSVector8i::load(&filter_up_coefs[0]); + auto l = GSVector8i::load(&core.RevbUpBuf[0][index]); + auto r = GSVector8i::load(&core.RevbUpBuf[1][index]); + + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); + + c = GSVector8i::load(&filter_up_coefs[16]); + l = GSVector8i::load(&core.RevbUpBuf[0][index + 16]); + r = GSVector8i::load(&core.RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector8i::load(&filter_up_coefs[32]); + l = GSVector8i::load(&core.RevbUpBuf[0][index + 32]); + r = GSVector8i::load(&core.RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.adds16(lacc.ba()); + racc = racc.adds16(racc.ba()); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + + return {lacc.I16[0], racc.I16[0]}; +} +#endif + +StereoOut32 __forceinline ReverbUpsample_sse(V_Core& core) +{ + int index = (core.RevbSampleBufPos - NUM_TAPS) & 63; + + auto c = GSVector4i::load(&filter_up_coefs[0]); + auto l = GSVector4i::load(&core.RevbUpBuf[0][index]); + auto r = GSVector4i::load(&core.RevbUpBuf[1][index]); + + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); + + c = GSVector4i::load(&filter_up_coefs[8]); + l = GSVector4i::load(&core.RevbUpBuf[0][index + 8]); + r = GSVector4i::load(&core.RevbUpBuf[1][index + 8]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[16]); + l = GSVector4i::load(&core.RevbUpBuf[0][index + 16]); + r = GSVector4i::load(&core.RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[24]); + l = GSVector4i::load(&core.RevbUpBuf[0][index + 24]); + r = GSVector4i::load(&core.RevbUpBuf[1][index + 24]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[32]); + l = GSVector4i::load(&core.RevbUpBuf[0][index + 32]); + r = GSVector4i::load(&core.RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + + return {lacc.I16[0], racc.I16[0]}; +} + +StereoOut32 ReverbUpsample(V_Core& core) +{ +#if _M_SSE >= 0x501 + return ReverbUpsample_avx(core); +#else + return ReverbUpsample_sse(core); +#endif +} + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index dfd6845b47b77..b2c29e0d3f5f0 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -19,6 +19,8 @@ #include "SPU2/SndOut.h" #include "SPU2/Global.h" +#include "GS/MultiISA.h" + #include // -------------------------------------------------------------------------------------- @@ -486,9 +488,6 @@ struct V_Core StereoOut32 DoReverb(const StereoOut32& Input); s32 RevbGetIndexer(s32 offset); - s32 ReverbDownsample(bool right); - StereoOut32 ReverbUpsample(); - StereoOut32 ReadInput(); StereoOut32 ReadInput_HiFi(); @@ -537,6 +536,14 @@ struct V_Core void FinishDMAwrite(); }; +MULTI_ISA_DEF( + StereoOut32 ReverbUpsample(V_Core& core); + s32 ReverbDownsample(V_Core& core, bool right); +) + +extern StereoOut32 (*ReverbUpsample)(V_Core& core); +extern s32 (*ReverbDownsample)(V_Core& core, bool right); + extern V_Core Cores[2]; extern V_SPDIF Spdif; diff --git a/pcsx2/SPU2/spu2sys.cpp b/pcsx2/SPU2/spu2sys.cpp index 8cd2fc0a1ff18..875bebeb42b3a 100644 --- a/pcsx2/SPU2/spu2sys.cpp +++ b/pcsx2/SPU2/spu2sys.cpp @@ -46,6 +46,9 @@ int PlayMode; static bool has_to_call_irq[2] = { false, false }; static bool has_to_call_irq_dma[2] = { false, false }; +StereoOut32 (*ReverbUpsample)(V_Core& core); +s32 (*ReverbDownsample)(V_Core& core, bool right); + static bool psxmode = false; @@ -111,6 +114,9 @@ void V_Core::Init(int index) if (SPU2::MsgToConsole()) SPU2::ConLog("* SPU2: Init SPU2 core %d \n", index); + ReverbDownsample = MULTI_ISA_SELECT(ReverbDownsample); + ReverbUpsample = MULTI_ISA_SELECT(ReverbUpsample); + //memset(this, 0, sizeof(V_Core)); // Explicitly initializing variables instead. Mute = false; diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 0e25c2a884a61..abe2e7c4c62e8 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -264,6 +264,7 @@ + @@ -872,4 +873,4 @@ - \ No newline at end of file + From 0b35a987ee97d953cd36da2977fef5d177935cef Mon Sep 17 00:00:00 2001 From: Ziemas Date: Thu, 19 Oct 2023 03:32:18 +0200 Subject: [PATCH 4/5] SPU2: Put clamp_mix in header for more inlining --- pcsx2/SPU2/Mixer.cpp | 10 ---------- pcsx2/SPU2/Mixer.h | 2 -- pcsx2/SPU2/defs.h | 12 +++++++++++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pcsx2/SPU2/Mixer.cpp b/pcsx2/SPU2/Mixer.cpp index 72b4b31c7c437..b0dbd70b0d95b 100644 --- a/pcsx2/SPU2/Mixer.cpp +++ b/pcsx2/SPU2/Mixer.cpp @@ -28,16 +28,6 @@ static const s32 tbl_XA_Factor[16][2] = {98, -55}, {122, -60}}; -__forceinline s32 clamp_mix(s32 x) -{ - return std::clamp(x, -0x8000, 0x7fff); -} - -__forceinline StereoOut32 clamp_mix(StereoOut32 sample) -{ - return StereoOut32(clamp_mix(sample.Left), clamp_mix(sample.Right)); -} - static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2) { const s32 header = *block; diff --git a/pcsx2/SPU2/Mixer.h b/pcsx2/SPU2/Mixer.h index 6e40f640bf2b9..11a4544e92def 100644 --- a/pcsx2/SPU2/Mixer.h +++ b/pcsx2/SPU2/Mixer.h @@ -16,5 +16,3 @@ #pragma once extern void Mix(); -extern s32 clamp_mix(s32 x); -extern StereoOut32 clamp_mix(StereoOut32 sample); diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index b2c29e0d3f5f0..2d08792db2b97 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -40,11 +40,21 @@ extern s16 spu2M_Read(u32 addr); extern void spu2M_Write(u32 addr, s16 value); extern void spu2M_Write(u32 addr, u16 value); -static inline s16 SignExtend16(u16 v) +static __forceinline s16 SignExtend16(u16 v) { return (s16)v; } +static __forceinline s32 clamp_mix(s32 x) +{ + return std::clamp(x, -0x8000, 0x7fff); +} + +static __forceinline StereoOut32 clamp_mix(StereoOut32 sample) +{ + return StereoOut32(clamp_mix(sample.Left), clamp_mix(sample.Right)); +} + struct V_VolumeLR { static V_VolumeLR Max; From 86f9226dbd838b368c1750bb6fab71b2896edefc Mon Sep 17 00:00:00 2001 From: Ziemas Date: Thu, 19 Oct 2023 04:05:34 +0200 Subject: [PATCH 5/5] SPU2: Pass reverb input by value --- pcsx2/SPU2/Reverb.cpp | 12 ++++++------ pcsx2/SPU2/defs.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pcsx2/SPU2/Reverb.cpp b/pcsx2/SPU2/Reverb.cpp index 9126d50148da1..8f4440ed248fd 100644 --- a/pcsx2/SPU2/Reverb.cpp +++ b/pcsx2/SPU2/Reverb.cpp @@ -66,19 +66,19 @@ __forceinline s32 V_Core::RevbGetIndexer(s32 offset) return x & 0xf'ffff; } -StereoOut32 V_Core::DoReverb(const StereoOut32& Input) +StereoOut32 V_Core::DoReverb(StereoOut32 Input) { if (EffectsStartA >= EffectsEndA) { return StereoOut32::Empty; } - auto input = clamp_mix(Input); + Input = clamp_mix(Input); - RevbDownBuf[0][RevbSampleBufPos] = input.Left; - RevbDownBuf[1][RevbSampleBufPos] = input.Right; - RevbDownBuf[0][RevbSampleBufPos | 64] = input.Left; - RevbDownBuf[1][RevbSampleBufPos | 64] = input.Right; + RevbDownBuf[0][RevbSampleBufPos] = Input.Left; + RevbDownBuf[1][RevbSampleBufPos] = Input.Right; + RevbDownBuf[0][RevbSampleBufPos | 64] = Input.Left; + RevbDownBuf[1][RevbSampleBufPos | 64] = Input.Right; bool R = Cycles & 1; diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index 2d08792db2b97..b2a8a5840c290 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -495,7 +495,7 @@ struct V_Core // -------------------------------------------------------------------------------------- StereoOut32 Mix(const VoiceMixSet& inVoices, const StereoOut32& Input, const StereoOut32& Ext); - StereoOut32 DoReverb(const StereoOut32& Input); + StereoOut32 DoReverb(StereoOut32 Input); s32 RevbGetIndexer(s32 offset); StereoOut32 ReadInput();