Skip to content

Commit

Permalink
Use a split filter for the FIR-based UHJ encoders
Browse files Browse the repository at this point in the history
This applies the all-pass filter in two steps, first as a relatively short
time-domain FIR filter, then as a series of frequency domain convolutions
(using complex multiplies). Time-domain convolution scales poorly, so larger
convolutions benefit from being done in the frequency domain (though the first
part is still done in the time domain, to avoid longer delays).
  • Loading branch information
kcat committed Oct 14, 2023
1 parent d3d2357 commit d157cf4
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 13 deletions.
4 changes: 4 additions & 0 deletions common/phase_shifter.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include "alspan.h"


struct NoInit { };

/* Implements a wide-band +90 degree phase-shift. Note that this should be
* given one sample less of a delay (FilterSize/2 - 1) compared to the direct
* signal delay (FilterSize/2) to properly align.
Expand Down Expand Up @@ -71,6 +73,8 @@ struct PhaseShifterT {
}
}

PhaseShifterT(NoInit) { }

void process(al::span<float> dst, const float *RESTRICT src) const;

private:
Expand Down
196 changes: 190 additions & 6 deletions core/uhjfilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "alcomplex.h"
#include "alnumeric.h"
#include "opthelpers.h"
#include "pffft.h"
#include "phase_shifter.h"


Expand All @@ -18,6 +19,120 @@ UhjQualityType UhjEncodeQuality{UhjQualityType::Default};

namespace {

struct PFFFTSetupDeleter {
void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); }
};
using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;

/* Convolution is implemented using a segmented overlap-add method. The filter
* response is broken up into multiple segments of 128 samples, and each
* segment has an FFT applied with a 256-sample buffer (the latter half left
* silent) to get its frequency-domain response.
*
* Input samples are similarly broken up into 128-sample segments, with an FFT
* applied with a 256-sample buffer to each new incoming segment to get its
* frequency-domain response. A history of FFT'd input segments is maintained,
* equal to the number of filter response segments.
*
* To apply the convolution, each filter response segment is convolved with its
* paired input segment (using complex multiplies, far cheaper than time-domain
* FIRs), accumulating into an FFT buffer. The input history is then shifted to
* align with later filter response segments for the next input segment.
*
* An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
* sample time-domain response for output, which is split in two halves. The
* first half is the 128-sample output, and the second half is a 128-sample
* (really, 127) delayed extension, which gets added to the output next time.
* Convolving two time-domain responses of length N results in a time-domain
* signal of length N*2 - 1, and this holds true regardless of the convolution
* being applied in the frequency domain, so these "overflow" samples need to
* be accounted for.
*
* To avoid a delay with gathering enough input samples to apply an FFT with,
* the first segment is applied directly in the time-domain as the samples come
* in. Once enough have been retrieved, the FFT is applied on the input and
* it's paired with the remaining (FFT'd) filter segments for processing.
*/
template<size_t N>
struct SplitFilter {
static constexpr size_t sFftLength{256};
static constexpr size_t sSampleLength{sFftLength / 2};
static constexpr size_t sNumSegments{N/sSampleLength - 1};
static_assert(N >= sFftLength);
static_assert((N % sSampleLength) == 0);

PhaseShifterT<sSampleLength> mPShift{NoInit{}};
PFFFTSetupPtr mFft;
alignas(16) std::array<float,sFftLength*sNumSegments> mFilterData;

SplitFilter()
{
mFft = PFFFTSetupPtr{pffft_new_setup(sFftLength, PFFFT_REAL)};

using complex_d = std::complex<double>;
constexpr size_t fft_size{N};
constexpr size_t half_size{fft_size / 2};

/* To set up the filter, we need to generate the desired response.
* Start with a pure delay that passes all frequencies through.
*/
auto fftBuffer = std::make_unique<complex_d[]>(fft_size);
std::fill_n(fftBuffer.get(), fft_size, complex_d{});
fftBuffer[half_size] = 1.0;

/* Convert to the frequency domain, shift the phase of each bin by +90
* degrees, then convert back to the time domain.
*/
forward_fft(al::span{fftBuffer.get(), fft_size});
fftBuffer[0] *= std::numeric_limits<double>::epsilon();
for(size_t i{1};i < half_size;++i)
fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
fftBuffer[half_size] *= std::numeric_limits<double>::epsilon();
for(size_t i{half_size+1};i < fft_size;++i)
fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
inverse_fft(al::span{fftBuffer.get(), fft_size});

/* Store the first segment of the buffer to apply as a time-domain
* filter (backwards for more efficient processing).
*/
auto fftiter = fftBuffer.get() + sSampleLength - 1;
for(float &coeff : mPShift.mCoeffs)
{
coeff = static_cast<float>(fftiter->real() / double{fft_size});
fftiter -= 2;
}

/* The remaining segments of the filter are converted back to the
* frequency domain, each on their own (0 stuffed).
*/
auto fftTmp = std::make_unique<float[]>(sFftLength);
float *filter{mFilterData.data()};
for(size_t s{0};s < sNumSegments;++s)
{
for(size_t i{0};i < sSampleLength;++i)
fftBuffer[i] = fftBuffer[sSampleLength*(s+1) + i].real() / double{fft_size};
std::fill_n(fftBuffer.get()+sSampleLength, sSampleLength, complex_d{});
forward_fft(al::span{fftBuffer.get(), sFftLength});

/* Convert to zdomain data for PFFFT, scaled by the FFT length so
* the iFFT result will be normalized.
*/
for(size_t i{0};i < sSampleLength;++i)
{
fftTmp[i*2 + 0] = static_cast<float>(fftBuffer[i].real()) / float{sFftLength};
fftTmp[i*2 + 1] = static_cast<float>((i == 0) ? fftBuffer[sSampleLength].real()
: fftBuffer[i].imag()) / float{sFftLength};
}
pffft_zreorder(mFft.get(), fftTmp.get(), filter, PFFFT_BACKWARD);
filter += sFftLength;
}
}
};

template<size_t N>
const SplitFilter<N> gSplitFilter;


const PhaseShifterT<UhjLength256> PShiftLq{};
const PhaseShifterT<UhjLength512> PShiftHq{};

Expand Down Expand Up @@ -87,7 +202,9 @@ template<size_t N>
void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
{
const auto &PShift = GetPhaseShifter<N>::Get();
static constexpr auto &Filter = gSplitFilter<N>;
static_assert(sFftLength == Filter.sFftLength);
static_assert(sNumSegments == Filter.sNumSegments);

ASSUME(SamplesToDo > 0);

Expand All @@ -104,10 +221,78 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i];

/* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset,
[](const float w, const float x) noexcept -> float
{ return -0.3420201f*w + 0.5098604f*x; });
PShift.process({mD.data(), SamplesToDo}, mWX.data());
for(size_t base{0};base < SamplesToDo;)
{
const size_t todo{minz(Filter.sSampleLength-mFifoPos, SamplesToDo-base)};

/* Transform the non-delayed input and store in the later half of the
* filter input.
*/
std::transform(winput+base, winput+base+todo, xinput+base,
mWXIn.begin()+Filter.sSampleLength + mFifoPos,
[](const float w, const float x) noexcept -> float
{ return -0.3420201f*w + 0.5098604f*x; });

/* Apply the first segment of the filter in the time domain. */
auto buf_iter = mD.begin() + base;
Filter.mPShift.process({buf_iter, todo}, mWXIn.begin()+1 + mFifoPos);

/* Add the other segments of the filter that were previously processed
* by the FFT.
*/
auto fifo_iter = mWXOut.begin() + mFifoPos;
std::transform(fifo_iter, fifo_iter+todo, buf_iter, buf_iter, std::plus<>{});

mFifoPos += todo;
base += todo;

/* Check whether the input buffer is filled with new samples. */
if(mFifoPos < Filter.sSampleLength) break;
mFifoPos = 0;

/* Move the newest input to the front for the next iteration's history. */
std::copy(mWXIn.cbegin()+Filter.sSampleLength, mWXIn.cend(), mWXIn.begin());
std::fill(mWXIn.begin()+Filter.sSampleLength, mWXIn.end(), 0.0f);

/* Overwrite the stale/oldest FFT'd segment with the newest input. */
const size_t curseg{mCurrentSegment};
pffft_transform(Filter.mFft.get(), mWXIn.data(),
mWXHistory.data() + curseg*Filter.sFftLength, mWorkData.data(), PFFFT_FORWARD);

/* Convolve each input segment with its IR filter counterpart (aligned
* in time).
*/
mFftBuffer.fill(0.0f);
const float *filter{Filter.mFilterData.data()};
const float *input{mWXHistory.data() + curseg*Filter.sFftLength};
for(size_t s{curseg};s < sNumSegments;++s)
{
pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
input += Filter.sFftLength;
filter += Filter.sFftLength;
}
input = mWXHistory.data();
for(size_t s{0};s < curseg;++s)
{
pffft_zconvolve_accumulate(Filter.mFft.get(), input, filter, mFftBuffer.data());
input += Filter.sFftLength;
filter += Filter.sFftLength;
}

/* Convert back to samples, writing to the output and storing the extra
* for next time.
*/
pffft_transform(Filter.mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
mWorkData.data(), PFFFT_BACKWARD);

for(size_t i{0};i < Filter.sSampleLength;++i)
mWXOut[i] = mFftBuffer[i] + mWXOut[Filter.sSampleLength+i];
for(size_t i{0};i < Filter.sSampleLength;++i)
mWXOut[Filter.sSampleLength+i] = mFftBuffer[Filter.sSampleLength+i];

/* Shift the input history. */
mCurrentSegment = curseg ? (curseg-1) : (sNumSegments-1);
}

/* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
for(size_t i{0};i < SamplesToDo;++i)
Expand All @@ -117,7 +302,6 @@ void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());
std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin());

/* Apply a delay to the existing output to align with the input delay. */
auto *delayBuffer = mDirectDelay.data();
Expand Down
17 changes: 10 additions & 7 deletions core/uhjfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ struct UhjEncoderBase {
template<size_t N>
struct UhjEncoder final : public UhjEncoderBase {
static constexpr size_t sFilterDelay{N/2};
static constexpr size_t sFftLength{256};
static constexpr size_t sSegmentSize{sFftLength/2};
static constexpr size_t sNumSegments{N/sSegmentSize - 1};

/* Delays and processing storage for the input signal. */
alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
Expand All @@ -60,11 +63,13 @@ struct UhjEncoder final : public UhjEncoderBase {
alignas(16) std::array<float,BufferLineSize> mS{};
alignas(16) std::array<float,BufferLineSize> mD{};

/* History and temp storage for the FIR filter. New samples should be
* written to index sFilterDelay*2 - 1.
*/
static constexpr size_t sWXInOffset{sFilterDelay*2 - 1};
alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mWX{};
/* History and temp storage for the convolution filter. */
size_t mFifoPos{}, mCurrentSegment{};
alignas(16) std::array<float,sFftLength> mWXIn{};
alignas(16) std::array<float,sFftLength> mWXOut{};
alignas(16) std::array<float,sFftLength> mFftBuffer{};
alignas(16) std::array<float,sFftLength> mWorkData{};
alignas(16) std::array<float,sFftLength*sNumSegments> mWXHistory{};

alignas(16) std::array<std::array<float,sFilterDelay>,2> mDirectDelay{};

Expand All @@ -77,8 +82,6 @@ struct UhjEncoder final : public UhjEncoderBase {
*/
void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
const size_t SamplesToDo) override;

DEF_NEWDEL(UhjEncoder)
};

struct UhjEncoderIIR final : public UhjEncoderBase {
Expand Down

0 comments on commit d157cf4

Please sign in to comment.