Skip to content

Commit

Permalink
Merge pull request #632 from wheremyfoodat/more-dsp
Browse files Browse the repository at this point in the history
WIP: Finishing DSP mixer
  • Loading branch information
wheremyfoodat authored Nov 20, 2024
2 parents 43991b7 + b78450c commit b2c0f18
Show file tree
Hide file tree
Showing 7 changed files with 370 additions and 56 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ set(APPLET_SOURCE_FILES src/core/applets/applet.cpp src/core/applets/mii_selecto
)
set(AUDIO_SOURCE_FILES src/core/audio/dsp_core.cpp src/core/audio/null_core.cpp src/core/audio/teakra_core.cpp
src/core/audio/miniaudio_device.cpp src/core/audio/hle_core.cpp src/core/audio/aac_decoder.cpp
src/core/audio/audio_interpolation.cpp
)
set(RENDERER_SW_SOURCE_FILES src/core/renderer_sw/renderer_sw.cpp)

Expand Down Expand Up @@ -354,6 +355,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
include/align.hpp include/audio/aac_decoder.hpp include/PICA/pica_simd.hpp include/services/fonts.hpp
include/audio/audio_interpolation.hpp include/audio/hle_mixer.hpp include/audio/dsp_simd.hpp
)

cmrc_add_resource_library(
Expand Down
58 changes: 58 additions & 0 deletions include/audio/audio_interpolation.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#pragma once

#include <array>
#include <deque>

#include "audio/hle_mixer.hpp"
#include "helpers.hpp"

namespace Audio::Interpolation {
// A variable length buffer of signed PCM16 stereo samples.
using StereoBuffer16 = std::deque<std::array<s16, 2>>;
using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;

// Resampler state that is carried over from one interpolation call to the next.
struct State {
// Two historical samples.
// They are prepended to the input buffer at the start of each call and refreshed before the call returns.
std::array<s16, 2> xn1 = {}; ///< x[n-1]
std::array<s16, 2> xn2 = {}; ///< x[n-2]
// Current fractional position.
// Kept in fixed point (the implementation uses 24 fractional bits) and measured relative to xn2.
u64 fposition = 0;
};

/**
* No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay.
* Does nothing if input is empty. Writing stops as soon as either the output is full
* or the input has been consumed.
* @param state Interpolation state. Updated with the carried-over samples and the fractional position.
* @param input Input buffer. Modified in place: the samples consumed by this call are erased from its front.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to. Incremented once for every sample written.
*/
void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);

/**
* Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay.
* Does nothing if input is empty. Writing stops as soon as either the output is full
* or the input has been consumed.
* @param state Interpolation state. Updated with the carried-over samples and the fractional position.
* @param input Input buffer. Modified in place: the samples consumed by this call are erased from its front.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to. Incremented once for every sample written.
*/
void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);

/**
* Polyphase interpolation. This is currently stubbed to just perform linear interpolation,
* so it has the same effects on state, input and outputi as linear().
* @param state Interpolation state. Updated with the carried-over samples and the fractional position.
* @param input Input buffer. Modified in place: the samples consumed by this call are erased from its front.
* @param rate Stretch factor. Must be a positive non-zero value.
* rate > 1.0 performs decimation and rate < 1.0 performs upsampling.
* @param output The resampled audio buffer.
* @param outputi The index of output to start writing to. Incremented once for every sample written.
*/
void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi);
} // namespace Audio::Interpolation
78 changes: 78 additions & 0 deletions include/audio/dsp_simd.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#pragma once

#include "audio/hle_mixer.hpp"
#include "compiler_builtins.hpp"
#include "helpers.hpp"

#if defined(_M_AMD64) || defined(__x86_64__)
#define DSP_SIMD_X64
#include <immintrin.h>
#elif defined(_M_ARM64) || defined(__aarch64__)
#define DSP_SIMD_ARM64
#include <arm_neon.h>
#endif

// Optimized SIMD functions for mixing the stereo output of a DSP voice into a quadraphonic intermediate mix
namespace DSP::MixIntoQuad {
using IntermediateMix = Audio::DSPMixer::IntermediateMix;
using StereoFrame16 = Audio::DSPMixer::StereoFrame<s16>;

// Scalar, architecture-independent implementation.
// Accumulates one stereo frame into the quad intermediate mix, scaling each channel by its gain.
// gains must point to 4 floats.
ALWAYS_INLINE static void mixPortable(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
	for (usize i = 0; i < Audio::samplesInFrame; i++) {
		// Stereo samples are stored in the format: (l, r)
		const s16 left = frame[i][0];
		const s16 right = frame[i][1];
		auto& quad = mix[i];

		// Quad channels 0 and 2 are fed by the left sample, channels 1 and 3 by the right one.
		// The float product is truncated towards zero when converted to s32.
		quad[0] += s32(left * gains[0]);
		quad[1] += s32(right * gains[1]);
		quad[2] += s32(left * gains[2]);
		quad[3] += s32(right * gains[3]);
	}
}

#if defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
// SSE4.1 implementation. Handles one stereo sample (= one quad sample of the intermediate mix) per iteration.
// gains must point to 4 floats.
//
// Unaligned loads/stores are used for gains and mix on purpose: neither `const float*` nor IntermediateMix
// guarantees 16-byte alignment through its type, and the aligned variants fault on a misaligned address.
// On data that does happen to be aligned, the unaligned forms are not slower on current x86 CPUs.
ALWAYS_INLINE static void mixSSE4_1(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
	const __m128 gains_ = _mm_loadu_ps(gains);

	for (usize sampleIndex = 0; sampleIndex < Audio::samplesInFrame; sampleIndex++) {
		// The stereo samples, repeated every 4 bytes inside the vector register.
		// The (l, r) pair is read as a single 32-bit value and broadcast; no float arithmetic is done on it.
		const __m128i stereoSamples = _mm_castps_si128(_mm_load1_ps(reinterpret_cast<const float*>(&frame[sampleIndex][0])));

		// Sign-extend the low four s16 lanes (l, r, l, r) to s32, then convert them to float
		const __m128 currentFrame = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(stereoSamples));
		// Apply the per-channel gains and truncate back to s32
		const __m128i offset = _mm_cvttps_epi32(_mm_mul_ps(currentFrame, gains_));

		// Accumulate into the intermediate mix
		__m128i* const mixSample = reinterpret_cast<__m128i*>(&mix[sampleIndex][0]);
		const __m128i intermediateMixPrev = _mm_loadu_si128(mixSample);
		_mm_storeu_si128(mixSample, _mm_add_epi32(intermediateMixPrev, offset));
	}
}
#endif

#ifdef DSP_SIMD_ARM64
// NEON implementation. Handles one stereo sample (= one quad sample of the intermediate mix) per iteration.
// gains must point to 4 floats.
ALWAYS_INLINE static void mixNEON(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
	const float32x4_t gainVector = vld1q_f32(gains);

	for (usize i = 0; i < Audio::samplesInFrame; i++) {
		s32* const mixSample = (s32*)&mix[i][0];

		// Read the (l, r) pair as one 32-bit value and broadcast it to all four lanes
		const int32x4_t packedPair = vld1q_dup_s32((s32*)&frame[i][0]);
		// Take the bottom 4 s16 values (l, r, l, r), widen them to s32 with sign extension, then convert to float
		const int16x4_t narrowSamples = vget_low_s16(vreinterpretq_s16_s32(packedPair));
		const float32x4_t samples = vcvtq_f32_s32(vmovl_s16(narrowSamples));

		// Scale each channel by its gain and truncate the result to s32
		const int32x4_t contribution = vcvtq_s32_f32(vmulq_f32(samples, gainVector));
		// Accumulate into the intermediate mix
		vst1q_s32(mixSample, vaddq_s32(vld1q_s32(mixSample), contribution));
	}
}
#endif

// Accumulates the stereo output of a DSP voice into a quadraphonic intermediate mix.
// The implementation is picked at compile time: NEON on ARM64, SSE4.1 on x64 when the compiler
// advertises it, and the portable scalar loop everywhere else.
static void mix(IntermediateMix& mix, StereoFrame16& frame, const float* gains) {
#if defined(DSP_SIMD_ARM64)
	mixNEON(mix, frame, gains);
#elif defined(DSP_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
	mixSSE4_1(mix, frame, gains);
#else
	mixPortable(mix, frame, gains);
#endif
}
} // namespace DSP::MixIntoQuad
64 changes: 21 additions & 43 deletions include/audio/hle_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,13 @@

#include "audio/aac.hpp"
#include "audio/aac_decoder.hpp"
#include "audio/audio_interpolation.hpp"
#include "audio/dsp_core.hpp"
#include "audio/dsp_shared_mem.hpp"
#include "audio/hle_mixer.hpp"
#include "memory.hpp"

namespace Audio {
using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;

struct DSPSource {
// Audio buffer information
// https://www.3dbrew.org/wiki/DSP_Memory_Region
Expand Down Expand Up @@ -47,14 +46,29 @@ namespace Audio {

// Buffer of decoded PCM16 samples. TODO: Are there better alternatives to use over deque?
using SampleBuffer = std::deque<std::array<s16, 2>>;

using BufferQueue = std::priority_queue<Buffer>;
using InterpolationMode = HLE::SourceConfiguration::Configuration::InterpolationMode;
using InterpolationState = Audio::Interpolation::State;

// The samples this voice outputs for this audio frame.
// Aligned to 4 for SIMD purposes.
alignas(4) DSPMixer::StereoFrame<s16> currentFrame;
BufferQueue buffers;

SampleFormat sampleFormat = SampleFormat::ADPCM;
SourceType sourceType = SourceType::Stereo;
InterpolationMode interpolationMode = InterpolationMode::Linear;
InterpolationState interpolationState;

// There's one gain configuration for each of the 3 intermediate mixing stages
// And each gain configuration is composed of 4 gain values, one for each sample in a quad-channel sample
// Aligned to 16 for SIMD purposes
alignas(16) std::array<std::array<float, 4>, 3> gains;
// Of the 3 intermediate mix stages, typically only the first one is actually enabled and the other ones do nothing
// Ie their gain is vec4(0.0). We track which stages are disabled (have a gain of all 0s) using this bitfield and skip them
// In order to save up on CPU time.
uint enabledMixStages = 0;

std::array<float, 3> gain0, gain1, gain2;
u32 samplePosition; // Sample number into the current audio buffer
float rateMultiplier;
u16 syncCount;
Expand Down Expand Up @@ -95,42 +109,6 @@ namespace Audio {
DSPSource() { reset(); }
};

// Holds the type aliases for audio frames and the configuration of the mixer
// (output channel format, per-stage volumes and aux stage enables).
class DSPMixer {
public:
// A single sample made up of channelCount values of type T
template <typename T, usize channelCount = 1>
using Sample = std::array<T, channelCount>;

// One audio frame: 160 samples with channelCount channels each
template <typename T, usize channelCount>
using Frame = std::array<Sample<T, channelCount>, 160>;

template <typename T>
using MonoFrame = Frame<T, 1>;

template <typename T>
using StereoFrame = Frame<T, 2>;

template <typename T>
using QuadFrame = Frame<T, 4>;

private:
using ChannelFormat = HLE::DspConfiguration::OutputFormat;
// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
static constexpr usize mixerStageCount = 3;

public:
ChannelFormat channelFormat = ChannelFormat::Stereo;
// NOTE(review): volumes and enableAuxStages have no default initializer here;
// they only receive defined values once reset() has been called.
std::array<float, mixerStageCount> volumes;
std::array<bool, 2> enableAuxStages;

// Restores the default configuration: stereo output, all volumes at 0 and both aux stages disabled
void reset() {
channelFormat = ChannelFormat::Stereo;

volumes.fill(0.0);
enableAuxStages.fill(false);
}
};

class HLE_DSP : public DSPCore {
// The audio frame types are public in case we want to use them for unit tests
public:
Expand All @@ -151,6 +129,7 @@ namespace Audio {

using Source = Audio::DSPSource;
using SampleBuffer = Source::SampleBuffer;
using IntermediateMix = DSPMixer::IntermediateMix;

private:
enum class DSPState : u32 {
Expand Down Expand Up @@ -218,7 +197,7 @@ namespace Audio {
void outputFrame();
// Perform the final mix, mixing the quadraphonic samples from all voices into the output audio frame
void performMix(Audio::HLE::SharedMemory& readRegion, Audio::HLE::SharedMemory& writeRegion);

// Decode an entire buffer worth of audio
void decodeBuffer(DSPSource& source);

Expand All @@ -245,5 +224,4 @@ namespace Audio {
void setSemaphore(u16 value) override {}
void setSemaphoreMask(u16 value) override {}
};

} // namespace Audio
50 changes: 50 additions & 0 deletions include/audio/hle_mixer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#pragma once
#include <array>

#include "audio/dsp_shared_mem.hpp"
#include "helpers.hpp"

namespace Audio {
using SampleFormat = HLE::SourceConfiguration::Configuration::Format;
using SourceType = HLE::SourceConfiguration::Configuration::MonoOrStereo;

// Holds the type aliases for audio frames and the configuration of the mixer
// (output channel format, per-stage volumes and aux stage enables).
class DSPMixer {
  public:
	// A single sample made up of channelCount values of type T
	template <typename T, usize channelCount = 1>
	using Sample = std::array<T, channelCount>;

	// One audio frame: 160 samples with channelCount channels each
	// NOTE(review): 160 presumably has to match Audio::samplesInFrame, which the SIMD mixers iterate up to - confirm
	template <typename T, usize channelCount>
	using Frame = std::array<Sample<T, channelCount>, 160>;

	template <typename T>
	using MonoFrame = Frame<T, 1>;

	template <typename T>
	using StereoFrame = Frame<T, 2>;

	template <typename T>
	using QuadFrame = Frame<T, 4>;

	// Internally the DSP uses four channels when mixing.
	// Neatly, QuadFrame<s32> means that every sample is an int32x4 value, which is particularly nice for SIMD mixing
	using IntermediateMix = QuadFrame<s32>;

  private:
	using ChannelFormat = HLE::DspConfiguration::OutputFormat;
	// The audio from each DSP voice is converted to quadraphonic and then fed into 3 intermediate mixing stages
	// Two of these intermediate mixers (second and third) are used for effects, including custom effects done on the CPU
	static constexpr usize mixerStageCount = 3;

  public:
	ChannelFormat channelFormat = ChannelFormat::Stereo;
	// Value-initialized so that a freshly constructed mixer holds the same values reset() would set,
	// instead of indeterminate ones
	std::array<float, mixerStageCount> volumes = {};
	std::array<bool, 2> enableAuxStages = {};

	// Restores the default configuration: stereo output, all volumes at 0 and both aux stages disabled
	void reset() {
		channelFormat = ChannelFormat::Stereo;

		volumes.fill(0.0f);
		enableAuxStages.fill(false);
	}
};
} // namespace Audio
73 changes: 73 additions & 0 deletions src/core/audio/audio_interpolation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Copyright 2016 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include "audio/audio_interpolation.hpp"

#include <algorithm>

#include "helpers.hpp"

namespace Audio::Interpolation {
// Resampling positions are tracked in fixed point with 24 fractional bits.
// (The bit count is not verified against hardware. It was chosen for minimal error.)
static constexpr u64 scaleFactor = u64(1) << 24;
// Mask that extracts the fractional part of a fixed-point position
static constexpr u64 scaleMask = scaleFactor - 1;

/// Walks over the input in increments of rate, producing one output sample per step, until either
/// the output is full or the input runs out.
/// Each step hands fn the fractional position plus three adjacent input samples.
/// The two samples saved in state are prepended to input first; on exit the consumed samples are
/// erased from input and state is updated so that the next call continues where this one stopped.
template <typename Function>
static void stepOverSamples(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi, Function fn) {
	if (input.empty()) {
		return;
	}

	// Put the history from the previous call in front of the new samples
	input.insert(input.begin(), {state.xn2, state.xn1});

	const u64 increment = static_cast<u64>(rate * scaleFactor);
	u64 position = state.fposition;
	usize index = 0;

	for (; outputi < output.size(); position += increment) {
		index = static_cast<usize>(position / scaleFactor);

		// fn needs samples index, index + 1 and index + 2. If they are not all available, stop
		// and keep the last two samples of the input as history
		if (index + 2 >= input.size()) {
			index = input.size() - 2;
			break;
		}

		output[outputi++] = fn(position & scaleMask, input[index], input[index + 1], input[index + 2]);
	}

	// Save the history and rebase the position so that it is relative to the new xn2
	state.xn2 = input[index];
	state.xn1 = input[index + 1];
	state.fposition = position - index * scaleFactor;

	// Drop everything up to and including the two samples that were just saved
	input.erase(input.begin(), input.begin() + (index + 2));
}

void none(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
	// Zero-order hold: every output sample is a copy of the first of the three samples at the current position
	const auto pickFirst = [](u64, const auto& current, const auto&, const auto&) { return current; };
	stepOverSamples(state, input, rate, output, outputi, pickFirst);
}

void linear(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
	// Note on accuracy: Some values that this produces are +/- 1 from the actual firmware.

	// Interpolates one channel between the samples `from` and `to`
	const auto lerpChannel = [](u64 fraction, s16 from, s16 to) {
		// This is a saturated subtraction. (Verified by black-box fuzzing.)
		const s64 delta = std::clamp<s64>(to - from, -32768, 32767);
		// fraction is unsigned, so the whole expression is evaluated in unsigned 64-bit arithmetic.
		// For a negative delta the wrapped-around intermediate value, once truncated to s16, comes out
		// the same as rounding the quotient towards negative infinity
		return static_cast<s16>(from + fraction * delta / scaleFactor);
	};

	stepOverSamples(state, input, rate, output, outputi, [lerpChannel](u64 fraction, const auto& x0, const auto& x1, const auto&) {
		return std::array<s16, 2>{
			lerpChannel(fraction, x0[0], x1[0]),
			lerpChannel(fraction, x0[1], x1[1]),
		};
	});
}

void polyphase(State& state, StereoBuffer16& input, float rate, StereoFrame16& output, usize& outputi) {
	// Polyphase interpolation is not implemented yet, so fall back to linear interpolation
	return linear(state, input, rate, output, outputi);
}
} // namespace Audio::Interpolation
Loading

0 comments on commit b2c0f18

Please sign in to comment.