From 18f6da0c476cf27614679d5b6889d36fe79d2699 Mon Sep 17 00:00:00 2001
From: chriseth <chris@ethereum.org>
Date: Wed, 18 Dec 2024 20:09:51 +0100
Subject: [PATCH] Specialized code for goldilocks. (#2253)

This is mostly a reduced copy of the Goldilocks implementation we
already have. The main difference is that division first tries an
integer division and uses its result if there is no remainder.
---
 executor/src/witgen/jit/compiler.rs           | 262 +++-----------
 executor/src/witgen/jit/includes/README.txt   |   1 +
 .../jit/includes/field_generic_up_to_64.rs    | 122 +++++++
 .../witgen/jit/includes/field_goldilocks.rs   | 319 ++++++++++++++++++
 executor/src/witgen/jit/includes/interface.rs | 104 ++++++
 number/src/traits.rs                          |   9 +-
 6 files changed, 603 insertions(+), 214 deletions(-)
 create mode 100644 executor/src/witgen/jit/includes/README.txt
 create mode 100644 executor/src/witgen/jit/includes/field_generic_up_to_64.rs
 create mode 100644 executor/src/witgen/jit/includes/field_goldilocks.rs
 create mode 100644 executor/src/witgen/jit/includes/interface.rs
diff --git a/executor/src/witgen/jit/compiler.rs b/executor/src/witgen/jit/compiler.rs
index e92fbbb3f4..995c2247d1 100644
--- a/executor/src/witgen/jit/compiler.rs
+++ b/executor/src/witgen/jit/compiler.rs
@@ -4,7 +4,7 @@ use std::{ffi::c_void, iter, mem, sync::Arc};
 use auto_enums::auto_enum;
 use itertools::Itertools;
 use libloading::Library;
-use powdr_number::FieldElement;
+use powdr_number::{FieldElement, GoldilocksField, KnownField};
 
 use crate::witgen::{
     data_structures::{finalizable_data::CompactData, mutable_state::MutableState},
@@ -327,224 +327,55 @@ fn util_code<T: FieldElement>(first_column_id: u64, column_count: usize) -> Resu
         ));
     }
 
-    let int_type = if mem::size_of::<T>() == 8 {
-        "u64"
-    } else {
-        "u32"
-    };
-    let double_int_type = if mem::size_of::<T>() == 8 {
-        "u128"
-    } else {
-        "u64"
-    };
-    let modulus = T::modulus();
-
-    Ok(format!(
-        r#"#![allow(non_snake_case, unused_parens)]
-
-#[derive(Clone, Copy, Default)]
-#[repr(transparent)]
-struct FieldElement({int_type});
-
-type IntType = {int_type};
-type DoubleIntType = {double_int_type};
-const MODULUS: IntType = {modulus}_{int_type};
-
-impl std::fmt::Display for FieldElement {{
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {{
-        write!(f, "{{}}", self.0)
-    }}
-}}
-impl From<IntType> for FieldElement {{
-    #[inline]
-    fn from(i: IntType) -> Self {{
-        Self(i)
-    }}
-}}
-impl std::ops::Add for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn add(self, b: Self) -> Self {{
-        // TODO this is inefficient.
-        Self(IntType::try_from(((self.0 as DoubleIntType) + (b.0 as DoubleIntType)) % (MODULUS as DoubleIntType)).unwrap())
-    }}
-}}
-impl std::ops::Sub for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn sub(self, b: Self) -> Self {{
-        // TODO this is inefficient.
-        Self(IntType::try_from(((self.0 as DoubleIntType) + (MODULUS as DoubleIntType) - (b.0 as DoubleIntType)) % (MODULUS as DoubleIntType)).unwrap())
-    }}
-}}
-impl std::ops::Neg for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn neg(self) -> Self {{
-        if self.0 == 0 {{
-            self
-        }} else {{
-            Self(MODULUS - self.0)
-        }}
-    }}
-}}
-impl std::ops::Mul<FieldElement> for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn mul(self, b: FieldElement) -> FieldElement {{
-        // TODO this is inefficient.
-        Self(IntType::try_from(((self.0 as DoubleIntType) * (b.0 as DoubleIntType)) % (MODULUS as DoubleIntType)).unwrap())
-    }}
-}}
-impl std::ops::Div<FieldElement> for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn div(self, b: FieldElement) -> FieldElement {{
-        if b.0 == 0 {{
-            panic!("Division by zero");
-        }}
-
-        if let Some(result) = try_integer_div_without_remainder(self.0, b.0) {{
-            Self(result)
-        }} else if let Some(result) = try_integer_div_without_remainder(self.0, MODULUS - b.0) {{
-            Self(MODULUS - result)
-        }} else if let Some(result) = try_integer_div_without_remainder(MODULUS - self.0, b.0) {{
-            Self(MODULUS - result)
-        }} else if let Some(result) = try_integer_div_without_remainder(MODULUS - self.0, MODULUS - b.0) {{
-            Self(result)
-        }} else {{
-            full_field_div(self, b)
-        }}
-    }}
-}}
-#[inline]
-fn try_integer_div_without_remainder(a: IntType, b: IntType) -> Option<IntType> {{
-    (a % b == 0).then(|| a / b)
-}}
-fn full_field_div(_: FieldElement, _: FieldElement) -> FieldElement {{
-    todo!()
-    // TODO generate the algorithm we use for goldilocks
-    // for a generic prime field.
-}}
-#[inline]
-fn integer_div(a: FieldElement, b: FieldElement) -> FieldElement {{
-    FieldElement(a.0 / b.0)
-}}
-impl std::ops::BitAnd<FieldElement> for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn bitand(self, b: FieldElement) -> FieldElement {{
-        Self(self.0 & b.0)
-    }}
-}}
-impl std::ops::BitOr<FieldElement> for FieldElement {{
-    type Output = Self;
-    #[inline]
-    fn bitor(self, b: FieldElement) -> FieldElement {{
-        Self(self.0 | b.0)
-    }}
-}}
-
-#[inline]
-fn known_to_slice<'a>(known: *mut u32, len: u64) -> &'a mut [u32] {{
-    let words_per_row = ({column_count} + 31) / 32;
-    let rows = len / {column_count};
-    let known_len = rows * words_per_row;
-    unsafe {{ std::slice::from_raw_parts_mut(known, known_len as usize) }}
-}}
-
-#[inline]
-fn index(global_offset: u64, local_offset: i32, column: u64) -> usize {{
-    let column = column - {first_column_id};
-    let row = (global_offset as i64 + local_offset as i64) as u64;
-    (row * {column_count} + column) as usize
-}}
-
-#[inline]
-fn index_known(global_offset: u64, local_offset: i32, column: u64) -> (u64, u64) {{
-    let column = column - {first_column_id};
-    let row = (global_offset as i64 + local_offset as i64) as u64;
-    let words_per_row = ({column_count} + 31) / 32;
-    (row * words_per_row + column / 32, column % 32)
-}}
-
-#[inline]
-fn get(data: &[FieldElement], global_offset: u64, local_offset: i32, column: u64) -> FieldElement {{
-    data[index(global_offset, local_offset, column)]
-}}
-
-#[inline]
-fn set(data: &mut [FieldElement], global_offset: u64, local_offset: i32, column: u64, value: FieldElement) {{
-    let i = index(global_offset, local_offset, column);
-    data[i] = value;
-}}
-
-#[inline]
-fn set_known(known: &mut [u32], global_offset: u64, local_offset: i32, column: u64) {{
-    let (known_idx, known_bit) = index_known(global_offset, local_offset, column);
-    known[known_idx as usize] |= 1 << (known_bit);
-}}
-
-#[inline]
-fn get_param(params: &[LookupCell<FieldElement>], i: usize) -> FieldElement {{
-    match params[i] {{
-        LookupCell::Input(v) => *v,
-        LookupCell::Output(_) => panic!("Output cell used as input"),
-    }}
-}}
-#[inline]
-fn set_param(params: &mut [LookupCell<FieldElement>], i: usize, value: FieldElement) {{
-    match &mut params[i] {{
-        LookupCell::Input(_) => panic!("Input cell used as output"),
-        LookupCell::Output(v) => **v = value,
-    }}
-}}
-
-#[repr(C)]
-enum LookupCell<'a, T> {{
-    /// Value is known (i.e. an input)
-    Input(&'a T),
-    /// Value is not known (i.e. an output)
-    Output(&'a mut T),
-}}
-
-#[repr(C)]
-pub struct MutSlice<T> {{
-    data: *mut T,
-    len: u64,
-}}
+    let field_impl = match T::known_field() {
+        Some(KnownField::GoldilocksField) => {
+            include_str!("includes/field_goldilocks.rs").to_string()
+        }
+        _ => {
+            let int_type = if mem::size_of::<T>() == 8 {
+                "u64"
+            } else {
+                "u32"
+            };
+            let double_int_type = if mem::size_of::<T>() == 8 {
+                "u128"
+            } else {
+                "u64"
+            };
+            let modulus = T::modulus();
 
-impl<T> From<&mut [T]> for MutSlice<T> {{
-    #[inline]
-    fn from(slice: &mut [T]) -> Self {{
-        MutSlice {{
-            data: slice.as_mut_ptr(),
-            len: slice.len() as u64,
-        }}
-    }}
-}}
+            format!(
+                "\
+                #[derive(Clone, Copy, Default)]\n\
+                #[repr(transparent)]\n\
+                struct FieldElement({int_type});\n\
+                \n\
+                type IntType = {int_type};\n\
+                type DoubleIntType = {double_int_type};\n\
+                const MODULUS: IntType = {modulus}_{int_type};\n\
+                {}\
+                ",
+                include_str!("includes/field_generic_up_to_64.rs")
+            )
+        }
+    };
 
-impl<T> MutSlice<T> {{
-    #[inline]
-    fn to_mut_slice<'a>(self) -> &'a mut [T] {{
-        unsafe {{ std::slice::from_raw_parts_mut(self.data, self.len as usize) }}
-    }}  
-}}
+    let interface = format!(
+        "\
+        const column_count: u64 = {column_count};\n\
+        const first_column_id: u64 = {first_column_id};\n\
+        {}",
+        include_str!("includes/interface.rs")
+    );
 
-#[repr(C)]
-pub struct WitgenFunctionParams<'a, T: 'a> {{
-    data: MutSlice<T>,
-    known: *mut u32,
-    row_offset: u64,
-    params: MutSlice<LookupCell<'a, T>>,
-    call_machine: extern "C" fn(*const std::ffi::c_void, u64, MutSlice<LookupCell<'_, T>>) -> bool,
-}}
-    "#
+    Ok(format!(
+        "#![allow(non_snake_case, unused_parens, unused_variables)]\n{field_impl}\n{interface}"
     ))
 }
 
 #[cfg(test)]
 mod tests {
+    use powdr_number::KoalaBearField;
     use pretty_assertions::assert_eq;
 
     use powdr_number::GoldilocksField;
@@ -552,10 +383,17 @@ mod tests {
     use super::*;
 
     #[test]
-    fn compile_util_code() {
+    fn compile_util_code_goldilocks() {
         compile_effects::<GoldilocksField>(0, 2, &[], &[]).unwrap();
     }
 
+    // We would like to test the generic field implementation, but
+    // we need direct representation and this is not clear.
+    // #[test]
+    // fn compile_util_code_koalabear() {
+    //     compile_effects::<KoalaBearField>(0, 2, &[], &[]).unwrap();
+    // }
+
     fn cell(column_name: &str, id: u64, row_offset: i32) -> Variable {
         Variable::Cell(Cell {
             column_name: column_name.to_string(),
diff --git a/executor/src/witgen/jit/includes/README.txt b/executor/src/witgen/jit/includes/README.txt
new file mode 100644
index 0000000000..f45305ab78
--- /dev/null
+++ b/executor/src/witgen/jit/includes/README.txt
@@ -0,0 +1 @@
+These files are included in the generated code. Because of that, this directory is not part of the Rust module tree.
\ No newline at end of file
diff --git a/executor/src/witgen/jit/includes/field_generic_up_to_64.rs b/executor/src/witgen/jit/includes/field_generic_up_to_64.rs
new file mode 100644
index 0000000000..47e14a7289
--- /dev/null
+++ b/executor/src/witgen/jit/includes/field_generic_up_to_64.rs
@@ -0,0 +1,122 @@
+// The following types are defined in the including code:
+// #[derive(Clone, Copy, Default)]
+// #[repr(transparent)]
+// struct FieldElement({int_type});
+
+// type IntType = {int_type};
+// type DoubleIntType = {double_int_type};
+// const MODULUS: IntType = {modulus}_{int_type};
+
+impl std::fmt::Display for FieldElement {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<IntType> for FieldElement {
+    #[inline]
+    fn from(i: IntType) -> Self {
+        Self(i)
+    }
+}
+impl std::ops::Add for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn add(self, b: Self) -> Self {
+        // TODO this is inefficient.
+        Self(
+            IntType::try_from(
+                ((self.0 as DoubleIntType) + (b.0 as DoubleIntType)) % (MODULUS as DoubleIntType),
+            )
+            .unwrap(),
+        )
+    }
+}
+impl std::ops::Sub for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn sub(self, b: Self) -> Self {
+        // TODO this is inefficient.
+        Self(
+            IntType::try_from(
+                ((self.0 as DoubleIntType) + (MODULUS as DoubleIntType) - (b.0 as DoubleIntType))
+                    % (MODULUS as DoubleIntType),
+            )
+            .unwrap(),
+        )
+    }
+}
+impl std::ops::Neg for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        if self.0 == 0 {
+            self
+        } else {
+            Self(MODULUS - self.0)
+        }
+    }
+}
+impl std::ops::Mul<FieldElement> for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn mul(self, b: FieldElement) -> FieldElement {
+        // TODO this is inefficient.
+        Self(
+            IntType::try_from(
+                ((self.0 as DoubleIntType) * (b.0 as DoubleIntType)) % (MODULUS as DoubleIntType),
+            )
+            .unwrap(),
+        )
+    }
+}
+impl std::ops::Div<FieldElement> for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn div(self, b: FieldElement) -> FieldElement {
+        if b.0 == 0 {
+            panic!("Division by zero");
+        }
+
+        if let Some(result) = try_integer_div_without_remainder(self.0, b.0) {
+            Self(result)
+        } else if let Some(result) = try_integer_div_without_remainder(self.0, MODULUS - b.0) {
+            Self(MODULUS - result)
+        } else if let Some(result) = try_integer_div_without_remainder(MODULUS - self.0, b.0) {
+            Self(MODULUS - result)
+        } else if let Some(result) =
+            try_integer_div_without_remainder(MODULUS - self.0, MODULUS - b.0)
+        {
+            Self(result)
+        } else {
+            full_field_div(self, b)
+        }
+    }
+}
+#[inline]
+fn try_integer_div_without_remainder(a: IntType, b: IntType) -> Option<IntType> {
+    (a % b == 0).then(|| a / b)
+}
+fn full_field_div(_: FieldElement, _: FieldElement) -> FieldElement {
+    todo!()
+    // TODO generate the algorithm we use for goldilocks
+    // for a generic prime field.
+}
+#[inline]
+fn integer_div(a: FieldElement, b: FieldElement) -> FieldElement {
+    FieldElement(a.0 / b.0)
+}
+impl std::ops::BitAnd<FieldElement> for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn bitand(self, b: FieldElement) -> FieldElement {
+        Self(self.0 & b.0)
+    }
+}
+impl std::ops::BitOr<FieldElement> for FieldElement {
+    type Output = Self;
+    #[inline]
+    fn bitor(self, b: FieldElement) -> FieldElement {
+        Self(self.0 | b.0)
+    }
+}
diff --git a/executor/src/witgen/jit/includes/field_goldilocks.rs b/executor/src/witgen/jit/includes/field_goldilocks.rs
new file mode 100644
index 0000000000..e2d4a59def
--- /dev/null
+++ b/executor/src/witgen/jit/includes/field_goldilocks.rs
@@ -0,0 +1,319 @@
+#[derive(Clone, Copy, Default)]
+#[repr(transparent)]
+struct GoldilocksField(u64);
+
+type FieldElement = GoldilocksField;
+
+const EPSILON: u64 = (1 << 32) - 1;
+
+impl GoldilocksField {
+    const ORDER: u64 = 0xFFFFFFFF00000001;
+
+    /// Returns the inverse of the field element, using Fermat's little theorem.
+    /// The inverse of `a` is computed as `a^(p-2)`, where `p` is the prime order of the field.
+    ///
+    /// Mathematically, this is equivalent to:
+    ///                $a^(p-1)     = 1 (mod p)$
+    ///                $a^(p-2) * a = 1 (mod p)$
+    /// Therefore      $a^(p-2)     = a^-1 (mod p)$
+    ///
+    /// The following code has been adapted from winterfell/math/src/field/f64/mod.rs
+    /// located at <https://github.com/facebook/winterfell>.
+    fn try_inverse(&self) -> Option<Self> {
+        if self.0 == 0 {
+            return None;
+        }
+
+        // compute base^(P - 2) using 72 multiplications
+        // The exponent P - 2 is represented in binary as:
+        // 0b1111111111111111111111111111111011111111111111111111111111111111
+
+        // compute base^11
+        let t2 = self.square() * *self;
+
+        // compute base^111
+        let t3 = t2.square() * *self;
+
+        // compute base^111111 (6 ones)
+        // repeatedly square t3 3 times and multiply by t3
+        let t6 = exp_acc::<3>(t3, t3);
+
+        // compute base^111111111111 (12 ones)
+        // repeatedly square t6 6 times and multiply by t6
+        let t12 = exp_acc::<6>(t6, t6);
+
+        // compute base^111111111111111111111111 (24 ones)
+        // repeatedly square t12 12 times and multiply by t12
+        let t24 = exp_acc::<12>(t12, t12);
+
+        // compute base^1111111111111111111111111111111 (31 ones)
+        // repeatedly square t24 6 times and multiply by t6 first; then square t30 and
+        // multiply by base
+        let t30 = exp_acc::<6>(t24, t6);
+        let t31 = t30.square() * *self;
+
+        // compute base^111111111111111111111111111111101111111111111111111111111111111
+        // repeatedly square t31 32 times and multiply by t31
+        let t63 = exp_acc::<32>(t31, t31);
+
+        // compute base^1111111111111111111111111111111011111111111111111111111111111111
+        Some(t63.square() * *self)
+    }
+
+    fn square(&self) -> Self {
+        *self * *self
+    }
+
+    fn exp_power_of_2(&self, power_log: usize) -> Self {
+        let mut res = *self;
+        for _ in 0..power_log {
+            res = res.square();
+        }
+        res
+    }
+
+    #[inline(always)]
+    fn from_canonical_u64(n: u64) -> Self {
+        debug_assert!(n < Self::ORDER);
+        Self(n)
+    }
+
+    #[inline]
+    fn to_canonical_u64(self) -> u64 {
+        self.0
+    }
+}
+
+#[inline]
+fn wrap(x: u64) -> u64 {
+    if x >= GoldilocksField::ORDER {
+        x - GoldilocksField::ORDER
+    } else {
+        x
+    }
+}
+
+impl std::ops::Neg for GoldilocksField {
+    type Output = Self;
+
+    #[inline]
+    fn neg(self) -> Self {
+        if self.0 == 0 {
+            self
+        } else {
+            Self(Self::ORDER - self.0)
+        }
+    }
+}
+
+impl std::ops::Add for GoldilocksField {
+    type Output = Self;
+
+    #[inline]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: Self) -> Self {
+        let (sum, over) = self.0.overflowing_add(rhs.0);
+        let (sum, over) = sum.overflowing_add((over as u64) * EPSILON);
+        debug_assert!(!over);
+        Self(wrap(sum))
+    }
+}
+
+impl std::ops::Sub for GoldilocksField {
+    type Output = Self;
+
+    #[inline]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: Self) -> Self {
+        let (diff, under) = self.0.overflowing_sub(rhs.0);
+        let (diff, under) = diff.overflowing_sub((under as u64) * EPSILON);
+        debug_assert!(!under);
+        Self(wrap(diff))
+    }
+}
+
+impl std::ops::Mul for GoldilocksField {
+    type Output = Self;
+
+    fn mul(self, rhs: Self) -> Self {
+        reduce128((self.0 as u128) * (rhs.0 as u128))
+    }
+}
+
+impl std::ops::Div for GoldilocksField {
+    type Output = Self;
+
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn div(self, b: Self) -> Self::Output {
+        if b.0 == 0 {
+            panic!("Division by zero");
+        }
+        if self.0 == 0 {
+            return self;
+        }
+
+        let MODULUS = Self::ORDER;
+
+        if let Some(result) = try_integer_div_without_remainder(self.0, b.0) {
+            Self(result)
+        } else if let Some(result) = try_integer_div_without_remainder(self.0, MODULUS - b.0) {
+            Self(MODULUS - result)
+        } else if let Some(result) = try_integer_div_without_remainder(MODULUS - self.0, b.0) {
+            Self(MODULUS - result)
+        } else if let Some(result) =
+            try_integer_div_without_remainder(MODULUS - self.0, MODULUS - b.0)
+        {
+            Self(result)
+        } else {
+            full_field_div(self, b)
+        }
+    }
+}
+
+#[inline]
+fn try_integer_div_without_remainder(a: u64, b: u64) -> Option<u64> {
+    (a % b == 0).then(|| a / b)
+}
+
+fn full_field_div(a: GoldilocksField, b: GoldilocksField) -> GoldilocksField {
+    a * b.try_inverse().unwrap()
+}
+
+/// Fast addition modulo ORDER for x86-64.
+/// This function is marked unsafe for the following reasons:
+///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
+///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
+///     the registers, so its use is not recommended when either input will be used again.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    let res_wrapped: u64;
+    let adjustment: u64;
+    core::arch::asm!(
+        "add {0}, {1}",
+        // Trick. The carry flag is set iff the addition overflowed.
+        // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
+        // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
+        // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
+        // on overflow; this happens to be EPSILON.
+        // Note that the CPU does not realize that the result of sbb x, x does not actually depend
+        // on x. We must write the result to a register that we know to be ready. We have a
+        // dependency on {1} anyway, so let's use it.
+        "sbb {1:e}, {1:e}",
+        inlateout(reg) x => res_wrapped,
+        inlateout(reg) y => adjustment,
+        options(pure, nomem, nostack),
+    );
+    assume(x != 0 || (res_wrapped == y && adjustment == 0));
+    assume(y != 0 || (res_wrapped == x && adjustment == 0));
+    // Add EPSILON == subtract ORDER.
+    // Cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect.
+    res_wrapped + adjustment
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+const unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    let (res_wrapped, carry) = x.overflowing_add(y);
+    // Below cannot overflow unless the assumption that x + y < 2**64 + ORDER is incorrect.
+    res_wrapped + EPSILON * (carry as u64)
+}
+
+/// Reduces to a 64-bit value. The result is in canonical form.
+#[inline]
+fn reduce128(x: u128) -> GoldilocksField {
+    let (x_lo, x_hi) = split(x); // This is a no-op
+    let x_hi_hi = x_hi >> 32;
+    let x_hi_lo = x_hi & EPSILON;
+
+    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
+    if borrow {
+        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
+        t0 -= EPSILON; // Cannot underflow.
+    }
+    let t1 = x_hi_lo * EPSILON;
+    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
+
+    GoldilocksField(wrap(t2))
+}
+
+/// Squares the base N number of times and multiplies the result by the tail value.
+#[inline(always)]
+fn exp_acc<const N: usize>(base: GoldilocksField, tail: GoldilocksField) -> GoldilocksField {
+    base.exp_power_of_2(N) * tail
+}
+
+#[inline]
+const fn split(x: u128) -> (u64, u64) {
+    (x as u64, (x >> 64) as u64)
+}
+
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+fn assume(p: bool) {
+    debug_assert!(p);
+    if !p {
+        unsafe {
+            core::hint::unreachable_unchecked();
+        }
+    }
+}
+
+/// Try to force Rust to emit a branch. Example:
+///     if x > 2 {
+///         y = foo();
+///         branch_hint();
+///     } else {
+///         y = bar();
+///     }
+/// This function has no semantics. It is a hint only.
+#[inline(always)]
+fn branch_hint() {
+    // NOTE: These are the currently supported assembly architectures. See the
+    // [nightly reference](https://doc.rust-lang.org/nightly/reference/inline-assembly.html) for
+    // the most up-to-date list.
+    #[cfg(any(
+        target_arch = "aarch64",
+        target_arch = "arm",
+        target_arch = "riscv32",
+        target_arch = "riscv64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+    ))]
+    unsafe {
+        core::arch::asm!("", options(nomem, nostack, preserves_flags));
+    }
+}
+
+impl From<u64> for GoldilocksField {
+    #[inline]
+    fn from(n: u64) -> Self {
+        Self(wrap(n))
+    }
+}
+
+impl std::fmt::Display for GoldilocksField {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+#[inline]
+fn integer_div(a: GoldilocksField, b: GoldilocksField) -> GoldilocksField {
+    GoldilocksField(a.0 / b.0)
+}
+
+impl std::ops::BitAnd<GoldilocksField> for GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn bitand(self, b: GoldilocksField) -> GoldilocksField {
+        Self(self.0 & b.0)
+    }
+}
+impl std::ops::BitOr<GoldilocksField> for GoldilocksField {
+    type Output = Self;
+    #[inline]
+    fn bitor(self, b: GoldilocksField) -> GoldilocksField {
+        Self(self.0 | b.0)
+    }
+}
diff --git a/executor/src/witgen/jit/includes/interface.rs b/executor/src/witgen/jit/includes/interface.rs
new file mode 100644
index 0000000000..e960c7b471
--- /dev/null
+++ b/executor/src/witgen/jit/includes/interface.rs
@@ -0,0 +1,104 @@
+// These constants are defined in the including code.
+// const column_count: u64 = ...;
+// const first_column_id: u64 = ...;
+
+#[inline]
+fn known_to_slice<'a>(known: *mut u32, len: u64) -> &'a mut [u32] {
+    let words_per_row = (column_count + 31) / 32;
+    let rows = len / column_count;
+    let known_len = rows * words_per_row;
+    unsafe { std::slice::from_raw_parts_mut(known, known_len as usize) }
+}
+
+#[inline]
+fn index(global_offset: u64, local_offset: i32, column: u64) -> usize {
+    let column = column - first_column_id;
+    let row = (global_offset as i64 + local_offset as i64) as u64;
+    (row * column_count + column) as usize
+}
+
+#[inline]
+fn index_known(global_offset: u64, local_offset: i32, column: u64) -> (u64, u64) {
+    let column = column - first_column_id;
+    let row = (global_offset as i64 + local_offset as i64) as u64;
+    let words_per_row = (column_count + 31) / 32;
+    (row * words_per_row + column / 32, column % 32)
+}
+
+#[inline]
+fn get(data: &[FieldElement], global_offset: u64, local_offset: i32, column: u64) -> FieldElement {
+    data[index(global_offset, local_offset, column)]
+}
+
+#[inline]
+fn set(
+    data: &mut [FieldElement],
+    global_offset: u64,
+    local_offset: i32,
+    column: u64,
+    value: FieldElement,
+) {
+    let i = index(global_offset, local_offset, column);
+    data[i] = value;
+}
+
+#[inline]
+fn set_known(known: &mut [u32], global_offset: u64, local_offset: i32, column: u64) {
+    let (known_idx, known_bit) = index_known(global_offset, local_offset, column);
+    known[known_idx as usize] |= 1 << (known_bit);
+}
+
+#[inline]
+fn get_param(params: &[LookupCell<FieldElement>], i: usize) -> FieldElement {
+    match params[i] {
+        LookupCell::Input(v) => *v,
+        LookupCell::Output(_) => panic!("Output cell used as input"),
+    }
+}
+#[inline]
+fn set_param(params: &mut [LookupCell<FieldElement>], i: usize, value: FieldElement) {
+    match &mut params[i] {
+        LookupCell::Input(_) => panic!("Input cell used as output"),
+        LookupCell::Output(v) => **v = value,
+    }
+}
+
+#[repr(C)]
+enum LookupCell<'a, T> {
+    /// Value is known (i.e. an input)
+    Input(&'a T),
+    /// Value is not known (i.e. an output)
+    Output(&'a mut T),
+}
+
+#[repr(C)]
+pub struct MutSlice<T> {
+    data: *mut T,
+    len: u64,
+}
+
+impl<T> From<&mut [T]> for MutSlice<T> {
+    #[inline]
+    fn from(slice: &mut [T]) -> Self {
+        MutSlice {
+            data: slice.as_mut_ptr(),
+            len: slice.len() as u64,
+        }
+    }
+}
+
+impl<T> MutSlice<T> {
+    #[inline]
+    fn to_mut_slice<'a>(self) -> &'a mut [T] {
+        unsafe { std::slice::from_raw_parts_mut(self.data, self.len as usize) }
+    }
+}
+
+#[repr(C)]
+pub struct WitgenFunctionParams<'a, T: 'a> {
+    data: MutSlice<T>,
+    known: *mut u32,
+    row_offset: u64,
+    params: MutSlice<LookupCell<'a, T>>,
+    call_machine: extern "C" fn(*const std::ffi::c_void, u64, MutSlice<LookupCell<'_, T>>) -> bool,
+}
diff --git a/number/src/traits.rs b/number/src/traits.rs
index 55809d1ad1..643430da7a 100644
--- a/number/src/traits.rs
+++ b/number/src/traits.rs
@@ -187,8 +187,13 @@ pub trait FieldElement:
     fn try_into_i32(&self) -> Option<i32>;
 
     /// Returns `true` if values of this type are directly stored as their integer
-    /// value (i.e. not in montgomery representation and there are also no
-    /// additional fields), i.e. the `to_integer` function can be implemented as
+    /// value, i.e.
+    /// - Montgomery representation is not used,
+    /// - values are always canonical (i.e. smaller than the modulus),
+    /// - there are no additional fields, and
+    /// - `repr(transparent)` is used.
+    ///
+    /// In other words, the `to_integer` function can be implemented as
     /// a mem::transmute operation on pointers.
     fn has_direct_repr() -> bool;
 }