Skip to content

Commit

Permalink
I grow annoyed
Browse files Browse the repository at this point in the history
  • Loading branch information
shssoichiro committed May 4, 2022
1 parent 4e99fc0 commit a22fb64
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 44 deletions.
57 changes: 29 additions & 28 deletions src/denoise/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::api::FrameQueue;
use crate::cpu_features::CpuFeatureLevel;
use crate::util::{Aligned, AlignedBoxedSlice};
use crate::EncoderStatus;
use arrayvec::ArrayVec;
use cfg_if::cfg_if;
Expand Down Expand Up @@ -296,13 +297,13 @@ where
) -> &mut (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>);

fn do_filtering(&mut self, src: &[[Plane<T>; 3]], dest: &mut Frame<T>) {
let mut dftr = [0f32; BLOCK_VOLUME];
let mut dftc = [Complex::<f32>::default(); COMPLEX_COUNT];
let mut means = [Complex::<f32>::default(); COMPLEX_COUNT];
let mut dftr = Aligned::new([0f32; BLOCK_VOLUME]);
let mut dftc = Aligned::new([Complex::<f32>::default(); COMPLEX_COUNT]);
let mut means = Aligned::new([Complex::<f32>::default(); COMPLEX_COUNT]);

for p in 0..3 {
let (pad_width, pad_height) = self.pad_dimensions(p);
let mut ebuff = vec![0f32; pad_width * pad_height];
let mut ebuff = AlignedBoxedSlice::new(pad_width * pad_height, 0f32);
let effective_height = self.effective_height(p);
let src_stride = src[0][p].cfg.stride;
let ebuff_stride = pad_width;
Expand All @@ -324,23 +325,23 @@ where
self.proc0(
src_planes[z].get_unchecked(x..),
self.hw().get_unchecked((BLOCK_AREA * z)..),
dftr.get_unchecked_mut((BLOCK_AREA * z)..),
dftr.data.get_unchecked_mut((BLOCK_AREA * z)..),
src_stride,
SB_SIZE,
self.src_scale(),
);
}

self.real_to_complex_3d(&dftr, &mut dftc);
self.remove_mean(&mut dftc, self.dftgc(), &mut means);
self.real_to_complex_3d(&dftr.data, &mut dftc.data);
self.remove_mean(&mut dftc.data, self.dftgc(), &mut means.data);

self.filter_coeffs(&mut dftc);
self.filter_coeffs(&mut dftc.data);

self.add_mean(&mut dftc, &means);
self.complex_to_real_3d(&dftc, &mut dftr);
self.add_mean(&mut dftc.data, &means.data);
self.complex_to_real_3d(&dftc.data, &mut dftr.data);

self.proc1(
dftr.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
dftr.data.get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
self.hw().get_unchecked((TB_MIDPOINT * BLOCK_AREA)..),
ebuff.get_unchecked_mut((y * ebuff_stride + x)..),
SB_SIZE,
Expand Down Expand Up @@ -405,7 +406,7 @@ where
let s0 = s0.add(u * p0 + v);
let s1 = s1.add(u * p0 + v);
let dest = dest.add(u * p1 + v);
dest.write(dest.read() + s0.read() * s1.read());
dest.write(s0.read().mul_add(s1.read(), dest.read()));
}
}
}
Expand Down Expand Up @@ -693,10 +694,10 @@ where
pad_dimensions: ArrayVec<(usize, usize), 3>,
effective_heights: ArrayVec<usize, 3>,

hw: [f32; BLOCK_VOLUME],
dftgc: [Complex<f32>; COMPLEX_COUNT],
hw: Aligned<[f32; BLOCK_VOLUME]>,
dftgc: Aligned<[Complex<f32>; COMPLEX_COUNT]>,
fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
sigmas: [f32; CCNT2],
sigmas: Aligned<[f32; CCNT2]>,
}

impl<T> DftDenoiserRust<T>
Expand All @@ -708,8 +709,8 @@ where
pad_dimensions: ArrayVec<(usize, usize), 3>,
effective_heights: ArrayVec<usize, 3>,
) -> Self {
let hw = create_window();
let mut dftgr = [0f32; BLOCK_VOLUME];
let hw = Aligned::new(create_window());
let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]);

let fft = (
R2cFftHandler::new(SB_SIZE),
Expand All @@ -719,15 +720,15 @@ where

let mut wscale = 0.0f32;
for k in 0..BLOCK_VOLUME {
dftgr[k] = 255.0 * hw[k];
wscale += hw[k].powi(2);
dftgr.data[k] = 255.0 * hw.data[k];
wscale += hw.data[k].powi(2);
}
let wscale = 1.0 / wscale;

let mut sigmas = [0f32; CCNT2];
sigmas.fill(sigma / wscale);
let mut sigmas = Aligned::new([0f32; CCNT2]);
sigmas.data.fill(sigma / wscale);

let mut denoiser = DftDenoiserRust {
let mut denoiser = Self {
dest_scale,
src_scale,
peak,
Expand All @@ -736,11 +737,11 @@ where
hw,
fft,
sigmas,
dftgc: [Complex::default(); COMPLEX_COUNT],
dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]),
};

let mut dftgc = [Complex::default(); COMPLEX_COUNT];
denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]);
denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data);
denoiser.dftgc = dftgc;

denoiser
Expand Down Expand Up @@ -778,17 +779,17 @@ where

#[inline(always)]
fn hw(&self) -> &[f32; BLOCK_VOLUME] {
&self.hw
&self.hw.data
}

#[inline(always)]
fn dftgc(&self) -> &[Complex<f32>; COMPLEX_COUNT] {
&self.dftgc
&self.dftgc.data
}

#[inline(always)]
fn sigmas(&self) -> &[f32; CCNT2] {
&self.sigmas
&self.sigmas.data
}

#[inline(always)]
Expand Down
73 changes: 57 additions & 16 deletions src/denoise/x86.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
use crate::util::Aligned;
use arrayvec::ArrayVec;
use ndrustfft::{Complex, FftHandler, R2cFftHandler};
use std::arch::x86_64::{
_mm256_castsi256_ps, _mm256_cvtepu16_epi32, _mm256_cvtepu8_epi32,
_mm256_load_ps, _mm256_mul_ps, _mm256_set1_ps, _mm256_store_ps,
_mm_load_si128,
};
use std::mem::size_of;
use v_frame::pixel::Pixel;

use super::{
Expand All @@ -19,10 +26,10 @@ where
pad_dimensions: ArrayVec<(usize, usize), 3>,
effective_heights: ArrayVec<usize, 3>,

hw: [f32; BLOCK_VOLUME],
dftgc: [Complex<f32>; COMPLEX_COUNT],
hw: Aligned<[f32; BLOCK_VOLUME]>,
dftgc: Aligned<[Complex<f32>; COMPLEX_COUNT]>,
fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
sigmas: [f32; CCNT2],
sigmas: Aligned<[f32; CCNT2]>,
}

impl<T> DftDenoiserAvx2<T>
Expand All @@ -35,8 +42,8 @@ where
pad_dimensions: ArrayVec<(usize, usize), 3>,
effective_heights: ArrayVec<usize, 3>,
) -> Self {
let hw = create_window();
let mut dftgr = [0f32; BLOCK_VOLUME];
let hw = Aligned::new(create_window());
let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]);

let fft = (
R2cFftHandler::new(SB_SIZE),
Expand All @@ -46,15 +53,15 @@ where

let mut wscale = 0.0f32;
for k in 0..BLOCK_VOLUME {
dftgr[k] = 255.0 * hw[k];
wscale += hw[k].powi(2);
dftgr.data[k] = 255.0 * hw.data[k];
wscale += hw.data[k].powi(2);
}
let wscale = 1.0 / wscale;

let mut sigmas = [0f32; CCNT2];
sigmas.fill(sigma / wscale);
let mut sigmas = Aligned::new([0f32; CCNT2]);
sigmas.data.fill(sigma / wscale);

let mut denoiser = DftDenoiserAvx2 {
let mut denoiser = Self {
dest_scale,
src_scale,
peak,
Expand All @@ -63,11 +70,11 @@ where
hw,
fft,
sigmas,
dftgc: [Complex::default(); COMPLEX_COUNT],
dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]),
};

let mut dftgc = [Complex::default(); COMPLEX_COUNT];
denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]);
denoiser.real_to_complex_3d(&dftgr.data, &mut dftgc.data);
denoiser.dftgc = dftgc;

denoiser
Expand All @@ -78,6 +85,40 @@ impl<T> Denoiser<T> for DftDenoiserAvx2<T>
where
T: Pixel,
{
#[inline]
unsafe fn proc0(
&self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
src_scale: f32,
) {
let s0 = s0.as_ptr();
let s1 = s1.as_ptr();
let dest = dest.as_mut_ptr();
let src_scale = _mm256_set1_ps(src_scale);

for u in 0..p1 {
for v in (0..p1).step_by(8) {
let s0 = s0.add(u * p0 + v);
let s1 = s1.add(u * p1 + v);
let dest = dest.add(u * p1 + v);

let v_s0 = if size_of::<T>() == 1 {
_mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_load_si128(s0.cast())))
} else {
_mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_load_si128(s0.cast())))
};
let v_s1 = _mm256_load_ps(s1);
if size_of::<T>() == 1 {
_mm256_store_ps(dest, _mm256_mul_ps(v_s0, v_s1));
} else {
_mm256_store_ps(
dest,
_mm256_mul_ps(_mm256_mul_ps(v_s0, src_scale), v_s1),
);
}
}
}
}

#[inline(always)]
fn pad_dimensions(&self, plane: usize) -> (usize, usize) {
self.pad_dimensions[plane]
Expand Down Expand Up @@ -105,17 +146,17 @@ where

#[inline(always)]
fn hw(&self) -> &[f32; BLOCK_VOLUME] {
&self.hw
&self.hw.data
}

#[inline(always)]
fn dftgc(&self) -> &[Complex<f32>; COMPLEX_COUNT] {
&self.dftgc
&self.dftgc.data
}

#[inline(always)]
fn sigmas(&self) -> &[f32; CCNT2] {
&self.sigmas
&self.sigmas.data
}

#[inline(always)]
Expand Down

0 comments on commit a22fb64

Please sign in to comment.