diff --git a/src/core/backend/avx512/fft.rs b/src/core/backend/avx512/fft.rs
new file mode 100644
index 000000000..20e8b79dc
--- /dev/null
+++ b/src/core/backend/avx512/fft.rs
@@ -0,0 +1,140 @@
+use std::arch::x86_64::{
+    __m512i, _mm512_add_epi32, _mm512_min_epu32, _mm512_mul_epu32, _mm512_permutex2var_epi32,
+    _mm512_srli_epi64, _mm512_sub_epi32,
+};
+
+/// L is an input to _mm512_permutex2var_epi32, and is used to interleave the even words of a
+/// with the even words of b.
+const L: __m512i = unsafe {
+    core::mem::transmute([
+        0b00000, 0b10000, 0b00010, 0b10010, 0b00100, 0b10100, 0b00110, 0b10110, 0b01000, 0b11000,
+        0b01010, 0b11010, 0b01100, 0b11100, 0b01110, 0b11110,
+    ])
+};
+/// H is an input to _mm512_permutex2var_epi32, and is used to interleave the odd words of a
+/// with the odd words of b.
+const H: __m512i = unsafe {
+    core::mem::transmute([
+        0b00001, 0b10001, 0b00011, 0b10011, 0b00101, 0b10101, 0b00111, 0b10111, 0b01001, 0b11001,
+        0b01011, 0b11011, 0b01101, 0b11101, 0b01111, 0b11111,
+    ])
+};
+const P: __m512i = unsafe { core::mem::transmute([(1u32 << 31) - 1; 16]) };
+
+/// Computes the butterfly operation for packed M31 elements.
+///   val0 + t val1, val0 - t val1.
+/// val0, val1 are packed M31 elements. 16 M31 words at each.
+/// Each value is assumed to be in unreduced form, [0, P] including P.
+/// Returned values are in unreduced form, [0, P] including P.
+/// twiddle_dbl holds 16 values, each is a *double* of a twiddle factor, in reduced form.
+/// # Safety
+/// This function is safe.
+pub unsafe fn avx_butterfly(
+    val0: __m512i,
+    val1: __m512i,
+    twiddle_dbl: __m512i,
+) -> (__m512i, __m512i) {
+    // Set up a word s.t. the lower half of each 64-bit word has the even 32-bit words of val0.
+    let val1_e = val1;
+    // Set up a word s.t. the lower half of each 64-bit word has the odd 32-bit words of val0.
+    let val1_o = _mm512_srli_epi64(val1, 32);
+    let twiddle_dbl_e = twiddle_dbl;
+    let twiddle_dbl_o = _mm512_srli_epi64(twiddle_dbl, 32);
+
+    // To compute prod = val1 * twiddle start by multipling
+    // val1_e/o by twiddle_dbl_e/o.
+    let prod_e_dbl = _mm512_mul_epu32(val1_e, twiddle_dbl_e);
+    let prod_o_dbl = _mm512_mul_epu32(val1_o, twiddle_dbl_o);
+
+    // The result of a multiplication holds val1*twiddle_dbl in as 64-bits.
+    // Each 64b-bit word looks like this:
+    //               1    31       31    1
+    // prod_e_dbl - |0|prod_e_h|prod_e_l|0|
+    // prod_o_dbl - |0|prod_o_h|prod_o_l|0|
+
+    // Interleave the even words of prod_e_dbl with the even words of prod_o_dbl:
+    let prod_ls = _mm512_permutex2var_epi32(prod_e_dbl, L, prod_o_dbl);
+    // prod_ls -    |prod_o_l|0|prod_e_l|0|
+
+    // Divide by 2:
+    let prod_ls = _mm512_srli_epi64(prod_ls, 1);
+    // prod_ls -    |0|prod_o_l|0|prod_e_l|
+
+    // Interleave the odd words of prod_e_dbl with the odd words of prod_o_dbl:
+    let prod_hs = _mm512_permutex2var_epi32(prod_e_dbl, H, prod_o_dbl);
+    // prod_hs -    |0|prod_o_h|0|prod_e_h|
+
+    let prod = add_mod_p(prod_ls, prod_hs);
+
+    let r0 = add_mod_p(val0, prod);
+    let r1 = sub_mod_p(val0, prod);
+
+    (r0, r1)
+}
+
+// TODO(spapini): Move these to M31 AVX.
+
+/// Adds two packed M31 elements, and reduces the result to the range [0,P].
+/// Each value is assumed to be in unreduced form, [0, P] including P.
+/// # Safety
+/// This function is safe.
+pub unsafe fn add_mod_p(a: __m512i, b: __m512i) -> __m512i {
+    // Add word by word. Each word is in the range [0, 2P].
+    let c = _mm512_add_epi32(a, b);
+    // Apply min(c, c-P) to each word.
+    // When c in [P,2P], then c-P in [0,P] which is always less than [P,2P].
+    // When c in [0,P-1], then c-P in [2^32-P,2^32-1] which is always greater than [0,P-1].
+    _mm512_min_epu32(c, _mm512_sub_epi32(c, P))
+}
+
+/// Subtracts two packed M31 elements, and reduces the result to the range [0,P].
+/// Each value is assumed to be in unreduced form, [0, P] including P.
+/// # Safety
+/// This function is safe.
+pub unsafe fn sub_mod_p(a: __m512i, b: __m512i) -> __m512i {
+    // Subtract word by word. Each word is in the range [-P, P].
+    let c = _mm512_sub_epi32(a, b);
+    // Apply min(c, c+P) to each word.
+    // When c in [0,P], then c+P in [P,2P] which is always greater than [0,P].
+    // When c in [2^32-P,2^32-1], then c+P in [0,P-1] which is always less than [2^32-P,2^32-1].
+    _mm512_min_epu32(_mm512_add_epi32(c, P), c)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::arch::x86_64::_mm512_setr_epi32;
+
+    use super::*;
+    use crate::core::fft::butterfly;
+    use crate::core::fields::m31::BaseField;
+
+    #[test]
+    fn test_butterfly() {
+        unsafe {
+            let val0 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+            let val1 = _mm512_setr_epi32(
+                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            let twiddle = _mm512_setr_epi32(
+                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+            );
+            let twiddle_dbl = _mm512_add_epi32(twiddle, twiddle);
+            let (r0, r1) = avx_butterfly(val0, val1, twiddle_dbl);
+
+            let val0: [u32; 16] = std::mem::transmute(val0);
+            let val1: [u32; 16] = std::mem::transmute(val1);
+            let twiddle: [u32; 16] = std::mem::transmute(twiddle);
+            let r0: [u32; 16] = std::mem::transmute(r0);
+            let r1: [u32; 16] = std::mem::transmute(r1);
+
+            for i in 0..16 {
+                let mut x = BaseField::from_u32_unchecked(val0[i]);
+                let mut y = BaseField::from_u32_unchecked(val1[i]);
+                let twiddle = BaseField::from_u32_unchecked(twiddle[i]);
+                butterfly(&mut x, &mut y, twiddle);
+                assert_eq!(x, BaseField::from_u32_unchecked(r0[i]));
+                assert_eq!(y, BaseField::from_u32_unchecked(r1[i]));
+            }
+        }
+    }
+}
diff --git a/src/core/backend/avx512/mod.rs b/src/core/backend/avx512/mod.rs
index 14984916d..b259c12a8 100644
--- a/src/core/backend/avx512/mod.rs
+++ b/src/core/backend/avx512/mod.rs
@@ -1,6 +1,7 @@
 pub mod bit_reverse;
 pub mod circle;
 pub mod cm31;
+pub mod fft;
 pub mod m31;
 pub mod qm31;