From 31fd712e84fe486d902e8e39f39bfdcc42392c20 Mon Sep 17 00:00:00 2001 From: Jed Brown Date: Thu, 14 Nov 2024 23:05:26 -0700 Subject: [PATCH] bench fft: closer C++ code to Rust and use -fno-plt Note that -mtune=cascadelake is only about tuning (like instruction timing) not arch/features. --- .../benchmarks/ReverseMode/fft/Makefile.make | 4 +-- enzyme/benchmarks/ReverseMode/fft/fft.h | 25 +++++++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make index 774f4565359..c3c17ec3779 100644 --- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make +++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make @@ -12,10 +12,10 @@ $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a: src/lib.rs Cargo.toml cargo +enzyme rustc --release --lib --crate-type=staticlib fft.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOADCLANG) $(BENCH) -DCPP=1 -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ + clang++ $(LOADCLANG) $(BENCH) -DCPP=1 -O3 -fno-math-errno -fno-plt -mtune=cascadelake -g $^ $(BENCHLINK) -lm -o $@ fftr.o: fft.cpp $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a - clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno $^ $(BENCHLINK) -lm -o $@ + clang++ $(LOADCLANG) $(BENCH) -O3 -fno-math-errno -fno-plt -mtune=cascadelake -g $^ $(BENCHLINK) -lm -o $@ results.json: fft.o fftr.o numactl -C 1 ./fft.o 1048576 | tee results.json diff --git a/enzyme/benchmarks/ReverseMode/fft/fft.h b/enzyme/benchmarks/ReverseMode/fft/fft.h index 4ee9ef0de8f..fad3c7dad14 100644 --- a/enzyme/benchmarks/ReverseMode/fft/fft.h +++ b/enzyme/benchmarks/ReverseMode/fft/fft.h @@ -27,7 +27,7 @@ inline void swap(double *a, double *b) { *b = temp; } -static void recursiveApply(double *data, size_t N, int iSign) { +static void recursiveApply(double *__restrict data, size_t N, int iSign) { if (N == 1) return; recursiveApply(data, N / 2, iSign); @@ -39,17 +39,20 @@ static void recursiveApply(double *data, size_t N, int iSign) { double wr = 1.0; double wi = 0.0; - for (size_t ii = 0; ii < N / 2; ii++) { - size_t i = 2 * ii; + for (size_t i = 0; i < N; i += 2) { size_t iN = i + N; - - double tempr = data[iN] * wr - data[iN + 1] * wi; - double tempi = data[iN] * wi + data[iN + 1] * wr; - - data[iN] = data[i] - tempr; - data[iN + 1] = data[i + 1] - tempi; - data[i] += tempr; - data[i + 1] += tempi; + double *__restrict ay = &data[i + 1]; + double *__restrict ax = &data[i]; + double *__restrict by = &data[iN + 1]; + double *__restrict bx = &data[iN]; + + double tempr = *bx * wr - *by * wi; + double tempi = *bx * wi + *by * wr; + + *bx = *ax - tempr; + *by = *ay - tempi; + *ax += tempr; + *ay += tempi; wtemp = wr; wr = wr * (wpr + 1.) - wi * wpi;