Skip to content

Commit

Permalink
SPU2: Optimize reverb resampling [SAVEVERSION+]
Browse files Browse the repository at this point in the history
  • Loading branch information
Ziemas committed Oct 18, 2023
1 parent b74e566 commit b3bc3a0
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 31 deletions.
5 changes: 5 additions & 0 deletions pcsx2/GS/GSVector4i.h
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,11 @@ class alignas(16) GSVector4i
return GSVector4i(_mm_adds_epi16(m, v.m));
}

// Horizontal saturating add of adjacent signed 16-bit pairs (_mm_hadds_epi16).
// The four pair sums taken from *this fill the low 64 bits of the result and the
// four pair sums taken from v fill the high 64 bits; every sum saturates to the
// signed 16-bit range instead of wrapping.
__forceinline GSVector4i hadds16(const GSVector4i& v) const
{
return GSVector4i(_mm_hadds_epi16(m, v.m));
}

__forceinline GSVector4i addus8(const GSVector4i& v) const
{
return GSVector4i(_mm_adds_epu8(m, v.m));
Expand Down
5 changes: 5 additions & 0 deletions pcsx2/GS/GSVector8i.h
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,11 @@ class alignas(32) GSVector8i
return GSVector8i(_mm256_adds_epi16(m, v.m));
}

// Horizontal saturating add of adjacent signed 16-bit pairs (_mm256_hadds_epi16).
// The intrinsic works on each 128-bit lane independently: within a lane the low
// 64 bits receive the four pair sums from *this and the high 64 bits the four
// pair sums from v. Sums never cross the lane boundary, so a full 16-element
// reduction needs the two lanes combined separately.
__forceinline GSVector8i hadds16(const GSVector8i& v) const
{
return GSVector8i(_mm256_hadds_epi16(m, v.m));
}

__forceinline GSVector8i addus8(const GSVector8i& v) const
{
return GSVector8i(_mm256_adds_epu8(m, v.m));
Expand Down
166 changes: 138 additions & 28 deletions pcsx2/SPU2/Reverb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

#include "PrecompiledHeader.h"
#include "Global.h"
#include "GS/GSVector.h"

#include <array>


Expand Down Expand Up @@ -55,7 +57,7 @@ void V_Core::AnalyzeReverbPreset()

static constexpr u32 NUM_TAPS = 39;
// 39 tap filter, the 0's could be optimized out
static constexpr std::array<s32, NUM_TAPS> filter_coefs = {
static constexpr std::array<s16, 48> filter_down_coefs alignas(32) = {
-1,
0,
2,
Expand Down Expand Up @@ -97,39 +99,147 @@ static constexpr std::array<s32, NUM_TAPS> filter_coefs = {
-1,
};

s32 __forceinline V_Core::ReverbDownsample(bool right)
static constexpr std::array<s16, 48> make_up_coefs()
{
int index = (RevbSampleBufPos - NUM_TAPS) & 63;
s32 out = 0;
std::array<s16, 48> ret = {};

for (int i = 0; i < NUM_TAPS; i++)
for (u32 i = 0; i < NUM_TAPS; i++)
{
out += RevbDownBuf[right][index + i] * filter_coefs[i];
ret[i] = static_cast<s16>(std::clamp<s32>(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX));
}

out >>= 15;
out = std::clamp<s32>(out, INT16_MIN, INT16_MAX);

return out;
return ret;
}

StereoOut32 __forceinline V_Core::ReverbUpsample()
static constexpr std::array<s16, 48> filter_up_coefs alignas(32) = make_up_coefs();

// Filters the most recent window of RevbDownBuf for one channel with
// filter_down_coefs and returns the resulting sample.
//
// right: false reads RevbDownBuf[0], true reads RevbDownBuf[1].
// Returns element 0 of the 16-bit accumulator after the horizontal reduction.
//
// All arithmetic stays in 16-bit lanes: products come from mul16hrs and are
// combined with saturating adds (adds16 / hadds16).
// NOTE(review): mul16hrs is defined in GSVector and not visible here; it is
// presumably the rounded Q15 multiply (_mm_mulhrs_epi16) - confirm.
s32 __forceinline V_Core::ReverbDownsample(bool right)
{
	// Start of the filter window. DoReverb stores every sample at both
	// RevbSampleBufPos and RevbSampleBufPos | 64, so a window starting anywhere
	// in 0..63 is contiguous in memory and needs no wrap handling.
	int index = (RevbSampleBufPos - NUM_TAPS) & 63;

#if _M_SSE >= 0x501
	// AVX2 path: 3 x 16 lanes cover the 48-entry coefficient table.
	// Coefficient loads use load<true> (the table is alignas(32)); sample loads
	// use load<false> because index is arbitrary.
	auto c = GSVector8i::load<true>(&filter_down_coefs[0]);
	auto s = GSVector8i::load<false>(&RevbDownBuf[right][index]);
	auto acc = s.mul16hrs(c);

	c = GSVector8i::load<true>(&filter_down_coefs[16]);
	s = GSVector8i::load<false>(&RevbDownBuf[right][index + 16]);
	acc = acc.adds16(s.mul16hrs(c));

	c = GSVector8i::load<true>(&filter_down_coefs[32]);
	s = GSVector8i::load<false>(&RevbDownBuf[right][index + 32]);
	acc = acc.adds16(s.mul16hrs(c));

	// hadds16 never crosses the 128-bit lane boundary, so combine the two
	// halves first (ba() presumably swaps them - confirm in GSVector8i).
	acc = acc.adds16(acc.ba());

	// Three pairwise rounds reduce 8 lanes to a single total in element 0.
	acc = acc.hadds16(acc);
	acc = acc.hadds16(acc);
	acc = acc.hadds16(acc);
#else
	// SSE path: 5 x 8 lanes cover taps 0..39.
	auto c = GSVector4i::load<true>(&filter_down_coefs[0]);
	auto s = GSVector4i::load<false>(&RevbDownBuf[right][index]);
	auto acc = s.mul16hrs(c);

	c = GSVector4i::load<true>(&filter_down_coefs[8]);
	s = GSVector4i::load<false>(&RevbDownBuf[right][index + 8]);
	acc = acc.adds16(s.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_down_coefs[16]);
	s = GSVector4i::load<false>(&RevbDownBuf[right][index + 16]);
	acc = acc.adds16(s.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_down_coefs[24]);
	s = GSVector4i::load<false>(&RevbDownBuf[right][index + 24]);
	acc = acc.adds16(s.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_down_coefs[32]);
	s = GSVector4i::load<false>(&RevbDownBuf[right][index + 32]);
	acc = acc.adds16(s.mul16hrs(c));

	// Three pairwise rounds reduce 8 lanes to a single total in element 0.
	acc = acc.hadds16(acc);
	acc = acc.hadds16(acc);
	acc = acc.hadds16(acc);
#endif

	return acc.I16[0];
}

// Filters the most recent window of both RevbUpBuf channels with
// filter_up_coefs and returns the pair of resulting samples.
//
// Returns {left, right}, each taken from element 0 of its 16-bit accumulator
// after the horizontal reduction.
//
// Left and right share every coefficient load. All arithmetic stays in 16-bit
// lanes: products come from mul16hrs and are combined with saturating adds.
// NOTE(review): mul16hrs is defined in GSVector and not visible here; it is
// presumably the rounded Q15 multiply (_mm_mulhrs_epi16) - confirm.
StereoOut32 __forceinline V_Core::ReverbUpsample()
{
	// Start of the filter window. DoReverb stores every sample at both
	// RevbSampleBufPos and RevbSampleBufPos | 64, so a window starting anywhere
	// in 0..63 is contiguous in memory and needs no wrap handling.
	int index = (RevbSampleBufPos - NUM_TAPS) & 63;

#if _M_SSE >= 0x501
	// AVX2 path: 3 x 16 lanes cover the 48-entry coefficient table.
	auto c = GSVector8i::load<true>(&filter_up_coefs[0]);
	auto l = GSVector8i::load<false>(&RevbUpBuf[0][index]);
	auto r = GSVector8i::load<false>(&RevbUpBuf[1][index]);

	auto lacc = l.mul16hrs(c);
	auto racc = r.mul16hrs(c);

	c = GSVector8i::load<true>(&filter_up_coefs[16]);
	l = GSVector8i::load<false>(&RevbUpBuf[0][index + 16]);
	r = GSVector8i::load<false>(&RevbUpBuf[1][index + 16]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	c = GSVector8i::load<true>(&filter_up_coefs[32]);
	l = GSVector8i::load<false>(&RevbUpBuf[0][index + 32]);
	r = GSVector8i::load<false>(&RevbUpBuf[1][index + 32]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	// hadds16 never crosses the 128-bit lane boundary, so combine the two
	// halves first (ba() presumably swaps them - confirm in GSVector8i).
	lacc = lacc.adds16(lacc.ba());
	racc = racc.adds16(racc.ba());

	// Three pairwise rounds reduce 8 lanes to a single total in element 0.
	lacc = lacc.hadds16(lacc);
	lacc = lacc.hadds16(lacc);
	lacc = lacc.hadds16(lacc);

	racc = racc.hadds16(racc);
	racc = racc.hadds16(racc);
	racc = racc.hadds16(racc);
#else
	// SSE path: 5 x 8 lanes cover taps 0..39.
	auto c = GSVector4i::load<true>(&filter_up_coefs[0]);
	auto l = GSVector4i::load<false>(&RevbUpBuf[0][index]);
	auto r = GSVector4i::load<false>(&RevbUpBuf[1][index]);

	auto lacc = l.mul16hrs(c);
	auto racc = r.mul16hrs(c);

	c = GSVector4i::load<true>(&filter_up_coefs[8]);
	l = GSVector4i::load<false>(&RevbUpBuf[0][index + 8]);
	r = GSVector4i::load<false>(&RevbUpBuf[1][index + 8]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_up_coefs[16]);
	l = GSVector4i::load<false>(&RevbUpBuf[0][index + 16]);
	r = GSVector4i::load<false>(&RevbUpBuf[1][index + 16]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_up_coefs[24]);
	l = GSVector4i::load<false>(&RevbUpBuf[0][index + 24]);
	r = GSVector4i::load<false>(&RevbUpBuf[1][index + 24]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	c = GSVector4i::load<true>(&filter_up_coefs[32]);
	l = GSVector4i::load<false>(&RevbUpBuf[0][index + 32]);
	r = GSVector4i::load<false>(&RevbUpBuf[1][index + 32]);
	lacc = lacc.adds16(l.mul16hrs(c));
	racc = racc.adds16(r.mul16hrs(c));

	// Three pairwise rounds reduce 8 lanes to a single total in element 0.
	lacc = lacc.hadds16(lacc);
	lacc = lacc.hadds16(lacc);
	lacc = lacc.hadds16(lacc);

	racc = racc.hadds16(racc);
	racc = racc.hadds16(racc);
	racc = racc.hadds16(racc);
#endif

	return {lacc.I16[0], racc.I16[0]};
}

__forceinline s32 V_Core::RevbGetIndexer(s32 offset)
Expand All @@ -151,10 +261,10 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input)
return StereoOut32::Empty;
}

RevbDownBuf[0][RevbSampleBufPos] = Input.Left;
RevbDownBuf[1][RevbSampleBufPos] = Input.Right;
RevbDownBuf[0][RevbSampleBufPos + 64] = Input.Left;
RevbDownBuf[1][RevbSampleBufPos + 64] = Input.Right;
RevbDownBuf[0][RevbSampleBufPos] = clamp_mix(Input.Left);
RevbDownBuf[1][RevbSampleBufPos] = clamp_mix(Input.Right);
RevbDownBuf[0][RevbSampleBufPos | 64] = clamp_mix(Input.Left);
RevbDownBuf[1][RevbSampleBufPos | 64] = clamp_mix(Input.Right);

bool R = Cycles & 1;

Expand Down Expand Up @@ -237,8 +347,8 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input)
RevbUpBuf[R][RevbSampleBufPos] = clamp_mix(out);
RevbUpBuf[!R][RevbSampleBufPos] = 0;

RevbUpBuf[R][RevbSampleBufPos + 64] = clamp_mix(out);
RevbUpBuf[!R][RevbSampleBufPos + 64] = 0;
RevbUpBuf[R][RevbSampleBufPos | 64] = clamp_mix(out);
RevbUpBuf[!R][RevbSampleBufPos | 64] = 0;

RevbSampleBufPos = (RevbSampleBufPos + 1) & 63;

Expand Down
4 changes: 2 additions & 2 deletions pcsx2/SPU2/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,8 @@ struct V_Core

V_Reverb Revb; // Reverb Registers

s32 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel
s32 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel
s16 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel
s16 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel
u32 RevbSampleBufPos;
u32 EffectsStartA;
u32 EffectsEndA;
Expand Down
2 changes: 1 addition & 1 deletion pcsx2/SaveState.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ enum class FreezeAction
// [SAVEVERSION+]
// This informs the auto updater that the users savestates will be invalidated.

// Bumped from 0x9A47 to 0x9A48: V_Core::RevbDownBuf / RevbUpBuf changed element
// type from s32 to s16, so savestates written by older builds are invalidated
// (presumably because the frozen SPU2 layout no longer matches - confirm).
static const u32 g_SaveVersion = (0x9A48 << 16) | 0x0000;


// the freezing data between submodules and core
Expand Down

0 comments on commit b3bc3a0

Please sign in to comment.