From a1230f74d0661fbc32584639919641d905e54d6d Mon Sep 17 00:00:00 2001
From: Linda Guiga <lindaguiga3@gmail.com>
Date: Mon, 2 Oct 2023 14:08:32 -0400
Subject: [PATCH 1/4] Remove extra shift CTL.

---
 evm/src/all_stark.rs                   |   5 +-
 evm/src/arithmetic/arithmetic_stark.rs |   3 +
 evm/src/arithmetic/divmod.rs           |  14 +-
 evm/src/arithmetic/mod.rs              |  43 ++-
 evm/src/arithmetic/mul.rs              |   6 +-
 evm/src/arithmetic/shift.rs            | 394 +++++++++++++++++++++++++
 evm/src/cpu/cpu_stark.rs               |  32 +-
 evm/src/witness/operation.rs           |   6 -
 8 files changed, 448 insertions(+), 55 deletions(-)
 create mode 100644 evm/src/arithmetic/shift.rs
diff --git a/evm/src/all_stark.rs b/evm/src/all_stark.rs
index b7168f8571..e5f631e81d 100644
--- a/evm/src/all_stark.rs
+++ b/evm/src/all_stark.rs
@@ -104,10 +104,7 @@ pub(crate) fn all_cross_table_lookups<F: Field>() -> Vec<CrossTableLookup<F>> {
 
 fn ctl_arithmetic<F: Field>() -> CrossTableLookup<F> {
     CrossTableLookup::new(
-        vec![
-            cpu_stark::ctl_arithmetic_base_rows(),
-            cpu_stark::ctl_arithmetic_shift_rows(),
-        ],
+        vec![cpu_stark::ctl_arithmetic_base_rows()],
         arithmetic_stark::ctl_arithmetic_rows(),
     )
 }
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index f38aab9ddb..3d281c868c 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -12,6 +12,7 @@ use plonky2::util::transpose;
 use static_assertions::const_assert;
 
 use super::columns::NUM_ARITH_COLUMNS;
+use super::shift;
 use crate::all_stark::Table;
 use crate::arithmetic::columns::{RANGE_COUNTER, RC_FREQUENCIES, SHARED_COLS};
 use crate::arithmetic::{addcy, byte, columns, divmod, modular, mul, Operation};
@@ -208,6 +209,7 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
         divmod::eval_packed(lv, nv, yield_constr);
         modular::eval_packed(lv, nv, yield_constr);
         byte::eval_packed(lv, yield_constr);
+        shift::eval_packed_generic(lv, nv, yield_constr);
     }
 
     fn eval_ext_circuit(
@@ -237,6 +239,7 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for ArithmeticSta
         divmod::eval_ext_circuit(builder, lv, nv, yield_constr);
         modular::eval_ext_circuit(builder, lv, nv, yield_constr);
         byte::eval_ext_circuit(builder, lv, yield_constr);
+        shift::eval_ext_circuit(builder, lv, nv, yield_constr);
     }
 
     fn constraint_degree(&self) -> usize {
diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index 258c131f32..17956b5246 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -45,7 +45,7 @@ pub(crate) fn generate<F: PrimeField64>(
     }
 
     match filter {
-        IS_DIV | IS_SHR => {
+        IS_DIV => {
             debug_assert!(
                 lv[OUTPUT_REGISTER]
                     .iter()
@@ -104,14 +104,11 @@ pub(crate) fn eval_packed<P: PackedField>(
     nv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    // Constrain IS_SHR independently, so that it doesn't impact the
-    // constraints when combining the flag with IS_DIV.
-    yield_constr.constraint_last_row(lv[IS_SHR]);
     eval_packed_divmod_helper(
         lv,
         nv,
         yield_constr,
-        lv[IS_DIV] + lv[IS_SHR],
+        lv[IS_DIV],
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -164,8 +161,7 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    yield_constr.constraint_last_row(builder, lv[IS_SHR]);
-    let div_shr_flag = builder.add_extension(lv[IS_DIV], lv[IS_SHR]);
+    let div_shr_flag = lv[IS_DIV];
     eval_ext_circuit_divmod_helper(
         builder,
         lv,
@@ -214,8 +210,6 @@ mod tests {
         for op in MODULAR_OPS {
             lv[op] = F::ZERO;
         }
-        // Deactivate the SHR flag so that a DIV operation is not triggered.
-        lv[IS_SHR] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
@@ -247,7 +241,6 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
-                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
@@ -308,7 +301,6 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
-                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index bd6d56e8cb..fad2a6a92b 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -9,6 +9,7 @@ mod byte;
 mod divmod;
 mod modular;
 mod mul;
+mod shift;
 mod utils;
 
 pub mod arithmetic_stark;
@@ -35,15 +36,39 @@ impl BinaryOperator {
     pub(crate) fn result(&self, input0: U256, input1: U256) -> U256 {
         match self {
             BinaryOperator::Add => input0.overflowing_add(input1).0,
-            BinaryOperator::Mul | BinaryOperator::Shl => input0.overflowing_mul(input1).0,
+            BinaryOperator::Mul => input0.overflowing_mul(input1).0,
+            BinaryOperator::Shl => {
+                // Compute the shifted displacement, so we can turn the left
+                // shift into a multiplication.
+                let shifted_input1 = if input1.bits() <= 32 {
+                    U256::one() << input1
+                } else {
+                    U256::zero()
+                };
+                input0.overflowing_mul(shifted_input1).0
+            }
             BinaryOperator::Sub => input0.overflowing_sub(input1).0,
-            BinaryOperator::Div | BinaryOperator::Shr => {
+            BinaryOperator::Div => {
                 if input1.is_zero() {
                     U256::zero()
                 } else {
                     input0 / input1
                 }
             }
+            BinaryOperator::Shr => {
+                // Compute the shifted displacement, so we can turn the
+                // right shift into a multiplication.
+                let shifted_input1 = if input1.bits() <= 32 {
+                    U256::one() << input1
+                } else {
+                    U256::zero()
+                };
+                if shifted_input1.is_zero() {
+                    U256::zero()
+                } else {
+                    input0 / shifted_input1
+                }
+            }
             BinaryOperator::Mod => {
                 if input1.is_zero() {
                     U256::zero()
@@ -238,15 +263,25 @@ fn binary_op_to_rows<F: PrimeField64>(
             addcy::generate(&mut row, op.row_filter(), input0, input1);
             (row, None)
         }
-        BinaryOperator::Mul | BinaryOperator::Shl => {
+        BinaryOperator::Mul => {
             mul::generate(&mut row, input0, input1);
             (row, None)
         }
-        BinaryOperator::Div | BinaryOperator::Mod | BinaryOperator::Shr => {
+        BinaryOperator::Shl => {
+            let mut nv = vec![F::ZERO; columns::NUM_ARITH_COLUMNS];
+            shift::generate(&mut row, &mut nv, true, input0, input1, result);
+            (row, None)
+        }
+        BinaryOperator::Div | BinaryOperator::Mod => {
             let mut nv = vec![F::ZERO; columns::NUM_ARITH_COLUMNS];
             divmod::generate(&mut row, &mut nv, op.row_filter(), input0, input1, result);
             (row, Some(nv))
         }
+        BinaryOperator::Shr => {
+            let mut nv = vec![F::ZERO; columns::NUM_ARITH_COLUMNS];
+            shift::generate(&mut row, &mut nv, false, input0, input1, result);
+            (row, Some(nv))
+        }
         BinaryOperator::AddFp254 | BinaryOperator::MulFp254 | BinaryOperator::SubFp254 => {
             ternary_op_to_rows::<F>(op.row_filter(), input0, input1, BN_BASE, result)
         }
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index efb4d82247..597d405192 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -121,7 +121,7 @@ pub fn eval_packed_generic<P: PackedField>(
 ) {
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
 
-    let is_mul = lv[IS_MUL] + lv[IS_SHL];
+    let is_mul = lv[IS_MUL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -173,7 +173,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let is_mul = builder.add_extension(lv[IS_MUL], lv[IS_SHL]);
+    let is_mul = lv[IS_MUL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -229,8 +229,6 @@ mod tests {
         // if `IS_MUL == 0`, then the constraints should be met even
         // if all values are garbage.
         lv[IS_MUL] = F::ZERO;
-        // Deactivate the SHL flag so that a MUL operation is not triggered.
-        lv[IS_SHL] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
diff --git a/evm/src/arithmetic/shift.rs b/evm/src/arithmetic/shift.rs
new file mode 100644
index 0000000000..260253f104
--- /dev/null
+++ b/evm/src/arithmetic/shift.rs
@@ -0,0 +1,394 @@
+//! Support for the EVM SHL and SHR instructions.
+//!
+//! This crate verifies an EVM shift instruction, which takes two
+//! 256-bit inputs S and A, and produces a 256-bit output C satisfying
+//!
+//!    C = A << S (mod 2^256) for SHL or
+//!    C = A >> S (mod 2^256) for SHR.
+//!
+//! The way this computation is carried out is by providing a third input
+//!    B = 1 << S (mod 2^256)
+//! and then computing:
+//!    C = A * B (mod 2^256) for SHL or
+//!    C = A / B (mod 2^256) for SHR
+//!
+//! Inputs A, S, and B, and output C, are given as arrays of 16-bit
+//! limbs. For example, if the limbs of A are a[0]...a[15], then
+//!
+//!    A = \sum_{i=0}^15 a[i] β^i,
+//!
+//! where β = 2^16 = 2^LIMB_BITS. To verify that A, S, B and C satisfy
+//! the equations, we proceed similarly to MUL for SHL and to DIV for SHR.
+
+use ethereum_types::U256;
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::field::types::{Field, PrimeField64};
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+use plonky2::plonk::circuit_builder::CircuitBuilder;
+
+use super::modular::modular_constr_poly_ext_circuit;
+use crate::arithmetic::columns::{self, *};
+use crate::arithmetic::modular::{generate_modular_op, modular_constr_poly};
+use crate::arithmetic::utils::*;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+
+/// Generates a shift operation (either SHL or SHR).
+/// The inputs are stored in the form `(shift, input, 1 << shift)`.
+/// NB: if `shift > 255`, then the third register holds 0.
+pub fn generate<F: PrimeField64>(
+    lv: &mut [F],
+    nv: &mut [F],
+    is_shl: bool,
+    input: U256,
+    shift: U256,
+    result: U256,
+) {
+    // TODO: It would probably be clearer/cleaner to read the U256
+    // into an [i64;N] and then copy that to the lv table.
+    // The first input is the shift we need to apply.
+    u256_to_array(&mut lv[INPUT_REGISTER_0], shift);
+    // The second register holds the input which needs shifting.
+    u256_to_array(&mut lv[INPUT_REGISTER_1], input);
+    u256_to_array(&mut lv[OUTPUT_REGISTER], result);
+    // If `shift > 255`, the shifted displacement is set to 0.
+    // Compute 1 << shift and store it in the third input register.
+    let shifted_displacement = if shift > U256::from(255u64) {
+        U256::zero()
+    } else {
+        U256::one() << shift
+    };
+
+    u256_to_array(&mut lv[INPUT_REGISTER_2], shifted_displacement);
+
+    let input0 = read_value_i64_limbs(lv, INPUT_REGISTER_1); // input
+    let input1 = read_value_i64_limbs(lv, INPUT_REGISTER_2); // 1 << shift
+
+    if is_shl {
+        // If the operation is SHL, we compute `input * shifted_displacement`.
+        const MASK: i64 = (1i64 << LIMB_BITS) - 1i64;
+
+        // Input and output have 16-bit limbs
+        let mut output_limbs = [0i64; N_LIMBS];
+
+        // Column-wise pen-and-paper long multiplication on 16-bit limbs.
+        // First calculate the coefficients of a(x)*b(x) (in unreduced_prod),
+        // then do carry propagation to obtain C = c(β) = a(β)*b(β).
+        let mut cy = 0i64;
+        let mut unreduced_prod = pol_mul_lo(input0, input1);
+        for col in 0..N_LIMBS {
+            let t = unreduced_prod[col] + cy;
+            cy = t >> LIMB_BITS;
+            output_limbs[col] = t & MASK;
+        }
+        // In principle, the last cy could be dropped because this is
+        // multiplication modulo 2^256. However, we need it below for
+        // aux_limbs to handle the fact that unreduced_prod will
+        // inevitably contain one digit's worth that is > 2^256.
+
+        pol_sub_assign(&mut unreduced_prod, &output_limbs);
+
+        let mut aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _, N_LIMBS>(unreduced_prod);
+        aux_limbs[N_LIMBS - 1] = -cy;
+
+        for c in aux_limbs.iter_mut() {
+            // we store the unsigned offset value c + 2^20
+            *c += AUX_COEFF_ABS_MAX;
+        }
+
+        debug_assert!(aux_limbs.iter().all(|&c| c.abs() <= 2 * AUX_COEFF_ABS_MAX));
+
+        lv[MUL_AUX_INPUT_LO].copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16(c as u16)));
+        lv[MUL_AUX_INPUT_HI]
+            .copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16((c >> 16) as u16)));
+    } else {
+        // If the operation is SHR, we compute `input / shifted_displacement` if `shifted_displacement != 0`;
+        // otherwise, the output is 0.
+        let input_limbs = read_value_i64_limbs::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
+        let pol_input = pol_extend(input_limbs);
+        let (out, quo_input) =
+            generate_modular_op(lv, nv, columns::IS_SHL, pol_input, INPUT_REGISTER_2);
+        debug_assert!(
+            &quo_input[N_LIMBS..].iter().all(|&x| x == F::ZERO),
+            "expected top half of quo_input to be zero"
+        );
+
+        // Initialise whole (double) register to zero; the low half will
+        // be overwritten via lv[AUX_INPUT_REGISTER] below.
+        for i in MODULAR_QUO_INPUT {
+            lv[i] = F::ZERO;
+        }
+
+        lv[AUX_INPUT_REGISTER_0].copy_from_slice(&out);
+    }
+}
+
+/// Evaluates the constraints for an SHL opcode.
+/// The logic is very similar to the one for MUL. The only difference is that
+/// the inputs are in `INPUT_REGISTER_1`  and `INPUT_REGISTER_2` instead of
+/// `INPUT_REGISTER_0` and `INPUT_REGISTER_1`.
+pub fn eval_packed_shl<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
+
+    let is_shl = lv[IS_SHL];
+    let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
+    let shifted_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_2);
+    let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
+
+    let aux_limbs = {
+        // MUL_AUX_INPUT was offset by 2^20 in generation, so we undo
+        // that here
+        let offset = P::Scalar::from_canonical_u64(AUX_COEFF_ABS_MAX as u64);
+        let mut aux_limbs = read_value::<N_LIMBS, _>(lv, MUL_AUX_INPUT_LO);
+        let aux_limbs_hi = &lv[MUL_AUX_INPUT_HI];
+        for (lo, &hi) in aux_limbs.iter_mut().zip(aux_limbs_hi) {
+            *lo += hi * base - offset;
+        }
+        aux_limbs
+    };
+
+    // Constraint poly holds the coefficients of the polynomial that
+    // must be identically zero for this multiplication to be
+    // verified.
+    //
+    // These two lines set constr_poly to the polynomial a(x)b(x) - c(x),
+    // where a, b and c are the polynomials
+    //
+    //   a(x) = \sum_i input0_limbs[i] * x^i
+    //   b(x) = \sum_i shifted_limbs[i] * x^i
+    //   c(x) = \sum_i output_limbs[i] * x^i
+    //
+    // This polynomial should equal (x - β)*s(x) where s is
+    //
+    //   s(x) = \sum_i aux_limbs[i] * x^i
+    //
+    let mut constr_poly = pol_mul_lo(input0_limbs, shifted_limbs);
+    pol_sub_assign(&mut constr_poly, &output_limbs);
+
+    // This subtracts (x - β) * s(x) from constr_poly.
+    pol_sub_assign(&mut constr_poly, &pol_adjoin_root(aux_limbs, base));
+
+    // At this point constr_poly holds the coefficients of the
+    // polynomial a(x)b(x) - c(x) - (x - β)*s(x). The
+    // multiplication is valid if and only if all of those
+    // coefficients are zero.
+    for &c in &constr_poly {
+        yield_constr.constraint(is_shl * c);
+    }
+}
+
+/// Evaluates the constraints for an SHR opcode.
+/// The logic is very similar to the one for DIV. The only difference is that
+/// the inputs are in `INPUT_REGISTER_1`  and `INPUT_REGISTER_2` instead of
+/// `INPUT_REGISTER_0` and `INPUT_REGISTER_1`.
+fn eval_packed_shr<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    nv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    let quo_range = OUTPUT_REGISTER;
+    let rem_range = AUX_INPUT_REGISTER_0;
+    let filter = lv[IS_SHR];
+    debug_assert!(quo_range.len() == N_LIMBS);
+    debug_assert!(rem_range.len() == N_LIMBS);
+
+    yield_constr.constraint_last_row(filter);
+
+    let num = &lv[INPUT_REGISTER_1];
+    let den = read_value(lv, INPUT_REGISTER_2);
+    let quo = {
+        let mut quo = [P::ZEROS; 2 * N_LIMBS];
+        quo[..N_LIMBS].copy_from_slice(&lv[quo_range]);
+        quo
+    };
+    let rem = read_value(lv, rem_range);
+
+    let mut constr_poly = modular_constr_poly(lv, nv, yield_constr, filter, rem, den, quo);
+
+    let input = num;
+    pol_sub_assign(&mut constr_poly, input);
+
+    for &c in constr_poly.iter() {
+        yield_constr.constraint_transition(filter * c);
+    }
+}
+
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    nv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    eval_packed_shl(lv, yield_constr);
+    eval_packed_shr(lv, nv, yield_constr);
+}
+
+pub fn eval_ext_circuit_shl<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_shl = lv[IS_SHL];
+    let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
+    let shifted_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_2);
+
+    let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
+
+    let aux_limbs = {
+        let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
+        let offset =
+            builder.constant_extension(F::Extension::from_canonical_u64(AUX_COEFF_ABS_MAX as u64));
+        let mut aux_limbs = read_value::<N_LIMBS, _>(lv, MUL_AUX_INPUT_LO);
+        let aux_limbs_hi = &lv[MUL_AUX_INPUT_HI];
+        for (lo, &hi) in aux_limbs.iter_mut().zip(aux_limbs_hi) {
+            //*lo = lo + hi * base - offset;
+            let t = builder.mul_sub_extension(hi, base, offset);
+            *lo = builder.add_extension(*lo, t);
+        }
+        aux_limbs
+    };
+
+    let mut constr_poly = pol_mul_lo_ext_circuit(builder, input0_limbs, shifted_limbs);
+    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &output_limbs);
+
+    let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
+    let rhs = pol_adjoin_root_ext_circuit(builder, aux_limbs, base);
+    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &rhs);
+
+    for &c in &constr_poly {
+        let filter = builder.mul_extension(is_shl, c);
+        yield_constr.constraint(builder, filter);
+    }
+}
+
+pub fn eval_ext_circuit_shr<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let filter = lv[IS_SHR];
+    yield_constr.constraint_last_row(builder, filter);
+
+    let quo_range = OUTPUT_REGISTER;
+    let rem_range = AUX_INPUT_REGISTER_0;
+    let num = &lv[INPUT_REGISTER_1];
+    let den = read_value(lv, INPUT_REGISTER_2);
+    let quo = {
+        let zero = builder.zero_extension();
+        let mut quo = [zero; 2 * N_LIMBS];
+        quo[..N_LIMBS].copy_from_slice(&lv[quo_range]);
+        quo
+    };
+    let rem = read_value(lv, rem_range);
+
+    let mut constr_poly =
+        modular_constr_poly_ext_circuit(lv, nv, builder, yield_constr, filter, rem, den, quo);
+
+    let input = num;
+    pol_sub_assign_ext_circuit(builder, &mut constr_poly, input);
+
+    for &c in constr_poly.iter() {
+        let t = builder.mul_extension(filter, c);
+        yield_constr.constraint_transition(builder, t);
+    }
+}
+
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    eval_ext_circuit_shl(builder, lv, yield_constr);
+    eval_ext_circuit_shr(builder, lv, nv, yield_constr);
+}
+
+#[cfg(test)]
+mod tests {
+    use plonky2::field::goldilocks_field::GoldilocksField;
+    use plonky2::field::types::{Field, Sample};
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+    use crate::arithmetic::columns::NUM_ARITH_COLUMNS;
+    use crate::constraint_consumer::ConstraintConsumer;
+
+    const N_RND_TESTS: usize = 1000;
+
+    // TODO: Should be able to refactor this test to apply to all operations.
+    #[test]
+    fn generate_eval_consistency_not_shl() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+        let nv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+
+        // if `IS_SHL == 0` and `IS_SHR == 0`, then the constraints should be met even
+        // if all values are garbage.
+        lv[IS_SHL] = F::ZERO;
+        lv[IS_SHR] = F::ZERO;
+
+        let mut constraint_consumer = ConstraintConsumer::new(
+            vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+            GoldilocksField::ONE,
+        );
+        eval_packed_generic(&lv, &nv, &mut constraint_consumer);
+        for &acc in &constraint_consumer.constraint_accs {
+            assert_eq!(acc, GoldilocksField::ZERO);
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_shl() {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+        let mut nv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+
+        // set `IS_SHL == 1` and ensure all constraints are satisfied.
+        lv[IS_SHL] = F::ONE;
+        lv[IS_SHR] = F::ZERO;
+
+        for _i in 0..N_RND_TESTS {
+            let shift = U256::from(rng.gen::<usize>());
+            let shifted = if shift > U256::from(255) {
+                U256::zero()
+            } else {
+                U256::one() << shift
+            };
+            u256_to_array(&mut lv[INPUT_REGISTER_0], shift);
+            u256_to_array(&mut lv[INPUT_REGISTER_2], shifted);
+            let mut full_input = U256::from(0);
+            // set inputs to random values
+            for ai in INPUT_REGISTER_1 {
+                lv[ai] = F::from_canonical_u16(rng.gen());
+                full_input =
+                    U256::from(lv[ai].to_canonical_u64()) + full_input * U256::from(1 << 16);
+            }
+
+            let output = full_input * shifted;
+
+            generate(&mut lv, &mut nv, true, full_input, shift, output);
+
+            let mut constraint_consumer = ConstraintConsumer::new(
+                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+            );
+            eval_packed_generic(&lv, &nv, &mut constraint_consumer);
+            for &acc in &constraint_consumer.constraint_accs {
+                assert_eq!(acc, GoldilocksField::ZERO);
+            }
+        }
+    }
+}
diff --git a/evm/src/cpu/cpu_stark.rs b/evm/src/cpu/cpu_stark.rs
index f23ff308b6..82ca5452b7 100644
--- a/evm/src/cpu/cpu_stark.rs
+++ b/evm/src/cpu/cpu_stark.rs
@@ -63,19 +63,10 @@ fn ctl_data_binops<F: Field>() -> Vec<Column<F>> {
 /// one output of a ternary operation. By default, ternary operations use
 /// the first three memory channels, and the last one for the result (binary
 /// operations do not use the third inputs).
-///
-/// Shift operations are different, as they are simulated with `MUL` or `DIV`
-/// on the arithmetic side. We first convert the shift into the multiplicand
-/// (in case of `SHL`) or the divisor (in case of `SHR`), making the first memory
-/// channel not directly usable. We overcome this by adding an offset of 1 in
-/// case of shift operations, which will skip the first memory channel and use the
-/// next three as ternary inputs. Because both `MUL` and `DIV` are binary operations,
-/// the last memory channel used for the inputs will be safely ignored.
-fn ctl_data_ternops<F: Field>(is_shift: bool) -> Vec<Column<F>> {
-    let offset = is_shift as usize;
-    let mut res = Column::singles(COL_MAP.mem_channels[offset].value).collect_vec();
-    res.extend(Column::singles(COL_MAP.mem_channels[offset + 1].value));
-    res.extend(Column::singles(COL_MAP.mem_channels[offset + 2].value));
+fn ctl_data_ternops<F: Field>() -> Vec<Column<F>> {
+    let mut res = Column::singles(COL_MAP.mem_channels[0].value).collect_vec();
+    res.extend(Column::singles(COL_MAP.mem_channels[1].value));
+    res.extend(Column::singles(COL_MAP.mem_channels[2].value));
     res.extend(Column::singles(
         COL_MAP.mem_channels[NUM_GP_CHANNELS - 1].value,
     ));
@@ -96,7 +87,7 @@ pub fn ctl_filter_logic<F: Field>() -> Column<F> {
 pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
     // Instead of taking single columns, we reconstruct the entire opcode value directly.
     let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
-    columns.extend(ctl_data_ternops(false));
+    columns.extend(ctl_data_ternops());
     // Create the CPU Table whose columns are those with the three
     // inputs and one output of the ternary operations listed in `ops`
     // (also `ops` is used as the operation filter). The list of
@@ -109,22 +100,11 @@ pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
             COL_MAP.op.binary_op,
             COL_MAP.op.fp254_op,
             COL_MAP.op.ternary_op,
+            COL_MAP.op.shift,
         ])),
     )
 }
 
-pub fn ctl_arithmetic_shift_rows<F: Field>() -> TableWithColumns<F> {
-    // Instead of taking single columns, we reconstruct the entire opcode value directly.
-    let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
-    columns.extend(ctl_data_ternops(true));
-    // Create the CPU Table whose columns are those with the three
-    // inputs and one output of the ternary operations listed in `ops`
-    // (also `ops` is used as the operation filter). The list of
-    // operations includes binary operations which will simply ignore
-    // the third input.
-    TableWithColumns::new(Table::Cpu, columns, Some(Column::single(COL_MAP.op.shift)))
-}
-
 pub fn ctl_data_byte_packing<F: Field>() -> Vec<Column<F>> {
     ctl_data_keccak_sponge()
 }
diff --git a/evm/src/witness/operation.rs b/evm/src/witness/operation.rs
index 0620069f00..7e5ed02708 100644
--- a/evm/src/witness/operation.rs
+++ b/evm/src/witness/operation.rs
@@ -499,12 +499,6 @@ fn append_shift<F: Field>(
         channel.addr_virtual = F::from_canonical_usize(lookup_addr.virt);
     }
 
-    // Convert the shift, and log the corresponding arithmetic operation.
-    let input0 = if input0 > U256::from(255u64) {
-        U256::zero()
-    } else {
-        U256::one() << input0
-    };
     let operator = if is_shl {
         BinaryOperator::Shl
     } else {

From 0f6404a5ccc8ee7f54f217176ebf64b3806f5038 Mon Sep 17 00:00:00 2001
From: Linda Guiga <lindaguiga3@gmail.com>
Date: Wed, 4 Oct 2023 13:15:38 -0400
Subject: [PATCH 2/4] Change order of inputs for the arithmetic shift
 operations. Add SHR test. Fix max number of bit shifts. Cleanup.

---
 evm/src/arithmetic/divmod.rs |  67 ++++++---
 evm/src/arithmetic/mod.rs    |  20 +--
 evm/src/arithmetic/mul.rs    |  85 +++++++----
 evm/src/arithmetic/shift.rs  | 267 ++++++++++-------------------------
 evm/src/witness/operation.rs |   2 +-
 5 files changed, 186 insertions(+), 255 deletions(-)

diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index 17956b5246..bd93eea430 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -15,24 +15,19 @@ use crate::arithmetic::modular::{
 use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 
-/// Generate the output and auxiliary values for modular operations.
-pub(crate) fn generate<F: PrimeField64>(
+/// Generates the output and auxiliary values for modular operations,
+/// assuming the input, modulus and output limbs are already set.
+pub(crate) fn generate_divmod<F: PrimeField64>(
     lv: &mut [F],
     nv: &mut [F],
     filter: usize,
-    input0: U256,
-    input1: U256,
-    result: U256,
+    input_limbs_range: Range<usize>,
+    modulus_range: Range<usize>,
 ) {
-    debug_assert!(lv.len() == NUM_ARITH_COLUMNS);
-
-    u256_to_array(&mut lv[INPUT_REGISTER_0], input0);
-    u256_to_array(&mut lv[INPUT_REGISTER_1], input1);
-    u256_to_array(&mut lv[OUTPUT_REGISTER], result);
-
-    let input_limbs = read_value_i64_limbs::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
+    let input_limbs = read_value_i64_limbs::<N_LIMBS, _>(lv, input_limbs_range);
     let pol_input = pol_extend(input_limbs);
-    let (out, quo_input) = generate_modular_op(lv, nv, filter, pol_input, INPUT_REGISTER_1);
+    let (out, quo_input) = generate_modular_op(lv, nv, filter, pol_input, modulus_range);
+
     debug_assert!(
         &quo_input[N_LIMBS..].iter().all(|&x| x == F::ZERO),
         "expected top half of quo_input to be zero"
@@ -45,7 +40,7 @@ pub(crate) fn generate<F: PrimeField64>(
     }
 
     match filter {
-        IS_DIV => {
+        IS_DIV | IS_SHR => {
             debug_assert!(
                 lv[OUTPUT_REGISTER]
                     .iter()
@@ -65,13 +60,32 @@ pub(crate) fn generate<F: PrimeField64>(
         _ => panic!("expected filter to be IS_DIV or IS_MOD but it was {filter}"),
     };
 }
+/// Generate the output and auxiliary values for modular operations.
+pub(crate) fn generate<F: PrimeField64>(
+    lv: &mut [F],
+    nv: &mut [F],
+    filter: usize,
+    input0: U256,
+    input1: U256,
+    result: U256,
+) {
+    debug_assert!(lv.len() == NUM_ARITH_COLUMNS);
+
+    u256_to_array(&mut lv[INPUT_REGISTER_0], input0);
+    u256_to_array(&mut lv[INPUT_REGISTER_1], input1);
+    u256_to_array(&mut lv[OUTPUT_REGISTER], result);
+
+    generate_divmod(lv, nv, filter, INPUT_REGISTER_0, INPUT_REGISTER_1);
+}
 
 /// Verify that num = quo * den + rem and 0 <= rem < den.
-fn eval_packed_divmod_helper<P: PackedField>(
+pub fn eval_packed_divmod_helper<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
     nv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
     filter: P,
+    num_range: Range<usize>,
+    den_range: Range<usize>,
     quo_range: Range<usize>,
     rem_range: Range<usize>,
 ) {
@@ -80,8 +94,8 @@ fn eval_packed_divmod_helper<P: PackedField>(
 
     yield_constr.constraint_last_row(filter);
 
-    let num = &lv[INPUT_REGISTER_0];
-    let den = read_value(lv, INPUT_REGISTER_1);
+    let num = &lv[num_range];
+    let den = read_value(lv, den_range);
     let quo = {
         let mut quo = [P::ZEROS; 2 * N_LIMBS];
         quo[..N_LIMBS].copy_from_slice(&lv[quo_range]);
@@ -109,6 +123,8 @@ pub(crate) fn eval_packed<P: PackedField>(
         nv,
         yield_constr,
         lv[IS_DIV],
+        INPUT_REGISTER_0,
+        INPUT_REGISTER_1,
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -117,24 +133,28 @@ pub(crate) fn eval_packed<P: PackedField>(
         nv,
         yield_constr,
         lv[IS_MOD],
+        INPUT_REGISTER_0,
+        INPUT_REGISTER_1,
         AUX_INPUT_REGISTER_0,
         OUTPUT_REGISTER,
     );
 }
 
-fn eval_ext_circuit_divmod_helper<F: RichField + Extendable<D>, const D: usize>(
+pub(crate) fn eval_ext_circuit_divmod_helper<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
     filter: ExtensionTarget<D>,
+    num_range: Range<usize>,
+    den_range: Range<usize>,
     quo_range: Range<usize>,
     rem_range: Range<usize>,
 ) {
     yield_constr.constraint_last_row(builder, filter);
 
-    let num = &lv[INPUT_REGISTER_0];
-    let den = read_value(lv, INPUT_REGISTER_1);
+    let num = &lv[num_range];
+    let den = read_value(lv, den_range);
     let quo = {
         let zero = builder.zero_extension();
         let mut quo = [zero; 2 * N_LIMBS];
@@ -161,13 +181,14 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let div_shr_flag = lv[IS_DIV];
     eval_ext_circuit_divmod_helper(
         builder,
         lv,
         nv,
         yield_constr,
-        div_shr_flag,
+        lv[IS_DIV],
+        INPUT_REGISTER_0,
+        INPUT_REGISTER_1,
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -177,6 +198,8 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         nv,
         yield_constr,
         lv[IS_MOD],
+        INPUT_REGISTER_0,
+        INPUT_REGISTER_1,
         AUX_INPUT_REGISTER_0,
         OUTPUT_REGISTER,
     );
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index fad2a6a92b..7763e98a06 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -38,14 +38,11 @@ impl BinaryOperator {
             BinaryOperator::Add => input0.overflowing_add(input1).0,
             BinaryOperator::Mul => input0.overflowing_mul(input1).0,
             BinaryOperator::Shl => {
-                // Compute the shifted displacement, so we can turn the left
-                // left into a multiplication.
-                let shifted_input1 = if input1.bits() <= 32 {
-                    U256::one() << input1
+                if input0 < U256::from(256usize) {
+                    input1 << input0
                 } else {
                     U256::zero()
-                };
-                input0.overflowing_mul(shifted_input1).0
+                }
             }
             BinaryOperator::Sub => input0.overflowing_sub(input1).0,
             BinaryOperator::Div => {
@@ -56,17 +53,10 @@ impl BinaryOperator {
                 }
             }
             BinaryOperator::Shr => {
-                // Compute the shifted displacement, so we can turn the
-                // right shift into a multiplication.
-                let shifted_input1 = if input1.bits() <= 32 {
-                    U256::one() << input1
+                if input0 < U256::from(256usize) {
+                    input1 >> input0
                 } else {
                     U256::zero()
-                };
-                if shifted_input1.is_zero() {
-                    U256::zero()
-                } else {
-                    input0 / shifted_input1
                 }
             }
             BinaryOperator::Mod => {
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index 597d405192..bec9a0e9ce 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -67,16 +67,8 @@ use crate::arithmetic::columns::*;
 use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 
-pub fn generate<F: PrimeField64>(lv: &mut [F], left_in: U256, right_in: U256) {
-    // TODO: It would probably be clearer/cleaner to read the U256
-    // into an [i64;N] and then copy that to the lv table.
-    u256_to_array(&mut lv[INPUT_REGISTER_0], left_in);
-    u256_to_array(&mut lv[INPUT_REGISTER_1], right_in);
-    u256_to_array(&mut lv[INPUT_REGISTER_2], U256::zero());
-
-    let input0 = read_value_i64_limbs(lv, INPUT_REGISTER_0);
-    let input1 = read_value_i64_limbs(lv, INPUT_REGISTER_1);
-
+/// Given the limbs of `left_in` and `right_in`, computes `left_in * right_in`.
+pub fn generate_mul<F: PrimeField64>(lv: &mut [F], left_in: [i64; 16], right_in: [i64; 16]) {
     const MASK: i64 = (1i64 << LIMB_BITS) - 1i64;
 
     // Input and output have 16-bit limbs
@@ -86,7 +78,7 @@ pub fn generate<F: PrimeField64>(lv: &mut [F], left_in: U256, right_in: U256) {
     // First calculate the coefficients of a(x)*b(x) (in unreduced_prod),
     // then do carry propagation to obtain C = c(β) = a(β)*b(β).
     let mut cy = 0i64;
-    let mut unreduced_prod = pol_mul_lo(input0, input1);
+    let mut unreduced_prod = pol_mul_lo(left_in, right_in);
     for col in 0..N_LIMBS {
         let t = unreduced_prod[col] + cy;
         cy = t >> LIMB_BITS;
@@ -115,17 +107,30 @@ pub fn generate<F: PrimeField64>(lv: &mut [F], left_in: U256, right_in: U256) {
         .copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16((c >> 16) as u16)));
 }
 
-pub fn eval_packed_generic<P: PackedField>(
+pub fn generate<F: PrimeField64>(lv: &mut [F], left_in: U256, right_in: U256) {
+    // TODO: It would probably be clearer/cleaner to read the U256
+    // into an [i64;N] and then copy that to the lv table.
+    u256_to_array(&mut lv[INPUT_REGISTER_0], left_in);
+    u256_to_array(&mut lv[INPUT_REGISTER_1], right_in);
+    u256_to_array(&mut lv[INPUT_REGISTER_2], U256::zero());
+
+    let input0 = read_value_i64_limbs(lv, INPUT_REGISTER_0);
+    let input1 = read_value_i64_limbs(lv, INPUT_REGISTER_1);
+
+    generate_mul(lv, input0, input1);
+}
+
+pub fn eval_packed_generic_mul<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
+    filter: P,
+    left_in_limbs: [P; 16],
+    right_in_limbs: [P; 16],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
-
-    let is_mul = lv[IS_MUL];
-    let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
-    let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
 
+    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
+
     let aux_limbs = {
         // MUL_AUX_INPUT was offset by 2^20 in generation, so we undo
         // that here
@@ -153,7 +158,7 @@ pub fn eval_packed_generic<P: PackedField>(
     //
     //   s(x) = \sum_i aux_limbs[i] * x^i
     //
-    let mut constr_poly = pol_mul_lo(input0_limbs, input1_limbs);
+    let mut constr_poly = pol_mul_lo(left_in_limbs, right_in_limbs);
     pol_sub_assign(&mut constr_poly, &output_limbs);
 
     // This subtracts (x - β) * s(x) from constr_poly.
@@ -164,18 +169,29 @@ pub fn eval_packed_generic<P: PackedField>(
     // multiplication is valid if and only if all of those
     // coefficients are zero.
     for &c in &constr_poly {
-        yield_constr.constraint(is_mul * c);
+        yield_constr.constraint(filter * c);
     }
 }
 
-pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
-    builder: &mut CircuitBuilder<F, D>,
-    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
-    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+pub fn eval_packed_generic<P: PackedField>(
+    lv: &[P; NUM_ARITH_COLUMNS],
+    yield_constr: &mut ConstraintConsumer<P>,
 ) {
     let is_mul = lv[IS_MUL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
+
+    eval_packed_generic_mul(lv, is_mul, input0_limbs, input1_limbs, yield_constr);
+}
+
+pub fn eval_ext_mul_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    filter: ExtensionTarget<D>,
+    left_in_limbs: [ExtensionTarget<D>; 16],
+    right_in_limbs: [ExtensionTarget<D>; 16],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
 
     let aux_limbs = {
@@ -192,7 +208,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         aux_limbs
     };
 
-    let mut constr_poly = pol_mul_lo_ext_circuit(builder, input0_limbs, input1_limbs);
+    let mut constr_poly = pol_mul_lo_ext_circuit(builder, left_in_limbs, right_in_limbs);
     pol_sub_assign_ext_circuit(builder, &mut constr_poly, &output_limbs);
 
     let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
@@ -200,11 +216,30 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     pol_sub_assign_ext_circuit(builder, &mut constr_poly, &rhs);
 
     for &c in &constr_poly {
-        let filter = builder.mul_extension(is_mul, c);
+        let filter = builder.mul_extension(filter, c);
         yield_constr.constraint(builder, filter);
     }
 }
 
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut CircuitBuilder<F, D>,
+    lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let is_mul = lv[IS_MUL];
+    let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
+    let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
+
+    eval_ext_mul_circuit(
+        builder,
+        lv,
+        is_mul,
+        input0_limbs,
+        input1_limbs,
+        yield_constr,
+    );
+}
+
 #[cfg(test)]
 mod tests {
     use plonky2::field::goldilocks_field::GoldilocksField;
diff --git a/evm/src/arithmetic/shift.rs b/evm/src/arithmetic/shift.rs
index 260253f104..297100d1fb 100644
--- a/evm/src/arithmetic/shift.rs
+++ b/evm/src/arithmetic/shift.rs
@@ -23,28 +23,30 @@
 use ethereum_types::U256;
 use plonky2::field::extension::Extendable;
 use plonky2::field::packed::PackedField;
-use plonky2::field::types::{Field, PrimeField64};
+use plonky2::field::types::PrimeField64;
 use plonky2::hash::hash_types::RichField;
 use plonky2::iop::ext_target::ExtensionTarget;
 use plonky2::plonk::circuit_builder::CircuitBuilder;
 
-use super::modular::modular_constr_poly_ext_circuit;
-use crate::arithmetic::columns::{self, *};
-use crate::arithmetic::modular::{generate_modular_op, modular_constr_poly};
+use super::{divmod, mul};
+use crate::arithmetic::columns::*;
 use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 
 /// Generates a shift operation (either SHL or SHR).
 /// The inputs are stored in the form `(shift, input, 1 << shift)`.
-/// NB: if `shift > 2^32`, then the third register holds 0.
+/// NB: if `shift >= 256`, then the third register holds 0.
+/// We leverage the functions in mul.rs and divmod.rs to carry out
+/// the computation.
 pub fn generate<F: PrimeField64>(
     lv: &mut [F],
     nv: &mut [F],
     is_shl: bool,
-    input: U256,
     shift: U256,
+    input: U256,
     result: U256,
 ) {
+    // We use the multiplication logic to generate SHL
     // TODO: It would probably be clearer/cleaner to read the U256
     // into an [i64;N] and then copy that to the lv table.
     // The first input is the shift we need to apply.
@@ -52,7 +54,7 @@ pub fn generate<F: PrimeField64>(
     // The second register holds the input which needs shifting.
     u256_to_array(&mut lv[INPUT_REGISTER_1], input);
     u256_to_array(&mut lv[OUTPUT_REGISTER], result);
-    // If `shift > 2^32`, the shifted displacement is set to 0.
+    // If `shift >= 256`, the shifted displacement is set to 0.
     // Compute 1 << shift and store it in the third input register.
     let shifted_displacement = if shift > U256::from(255u64) {
         U256::zero()
@@ -66,123 +68,32 @@ pub fn generate<F: PrimeField64>(
     let input1 = read_value_i64_limbs(lv, INPUT_REGISTER_2); // 1 << shift
 
     if is_shl {
-        // If the operation is SHL, we compute `input * shifted_displacement`.
-        const MASK: i64 = (1i64 << LIMB_BITS) - 1i64;
-
-        // Input and output have 16-bit limbs
-        let mut output_limbs = [0i64; N_LIMBS];
-
-        // Column-wise pen-and-paper long multiplication on 16-bit limbs.
-        // First calculate the coefficients of a(x)*b(x) (in unreduced_prod),
-        // then do carry propagation to obtain C = c(β) = a(β)*b(β).
-        let mut cy = 0i64;
-        let mut unreduced_prod = pol_mul_lo(input0, input1);
-        for col in 0..N_LIMBS {
-            let t = unreduced_prod[col] + cy;
-            cy = t >> LIMB_BITS;
-            output_limbs[col] = t & MASK;
-        }
-        // In principle, the last cy could be dropped because this is
-        // multiplication modulo 2^256. However, we need it below for
-        // aux_limbs to handle the fact that unreduced_prod will
-        // inevitably contain one digit's worth that is > 2^256.
-
-        pol_sub_assign(&mut unreduced_prod, &output_limbs);
-
-        let mut aux_limbs = pol_remove_root_2exp::<LIMB_BITS, _, N_LIMBS>(unreduced_prod);
-        aux_limbs[N_LIMBS - 1] = -cy;
-
-        for c in aux_limbs.iter_mut() {
-            // we store the unsigned offset value c + 2^20
-            *c += AUX_COEFF_ABS_MAX;
-        }
-
-        debug_assert!(aux_limbs.iter().all(|&c| c.abs() <= 2 * AUX_COEFF_ABS_MAX));
-
-        lv[MUL_AUX_INPUT_LO].copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16(c as u16)));
-        lv[MUL_AUX_INPUT_HI]
-            .copy_from_slice(&aux_limbs.map(|c| F::from_canonical_u16((c >> 16) as u16)));
+        // We generate the multiplication input0 * input1 using mul.rs.
+        mul::generate_mul(lv, input0, input1);
     } else {
         // If the operation is SHR, we compute: `input / shifted_displacement` if `shifted_displacement == 0`
-        // otherwise, the output is 0.
-        let input_limbs = read_value_i64_limbs::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
-        let pol_input = pol_extend(input_limbs);
-        let (out, quo_input) =
-            generate_modular_op(lv, nv, columns::IS_SHL, pol_input, INPUT_REGISTER_2);
-        debug_assert!(
-            &quo_input[N_LIMBS..].iter().all(|&x| x == F::ZERO),
-            "expected top half of quo_input to be zero"
-        );
-
-        // Initialise whole (double) register to zero; the low half will
-        // be overwritten via lv[AUX_INPUT_REGISTER] below.
-        for i in MODULAR_QUO_INPUT {
-            lv[i] = F::ZERO;
-        }
-
-        lv[AUX_INPUT_REGISTER_0].copy_from_slice(&out);
+        // otherwise, the output is 0. We use the logic in divmod.rs to achieve that.
+        divmod::generate_divmod(lv, nv, IS_SHR, INPUT_REGISTER_1, INPUT_REGISTER_2);
     }
 }
 
 /// Evaluates the constraints for an SHL opcode.
-/// The logic is very similar to the one for MUL. The only difference is that
+/// The logic is the same as the one for MUL. The only difference is that
 /// the inputs are in `INPUT_REGISTER_1`  and `INPUT_REGISTER_2` instead of
 /// `INPUT_REGISTER_0` and `INPUT_REGISTER_1`.
 pub fn eval_packed_shl<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
-
     let is_shl = lv[IS_SHL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let shifted_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_2);
-    let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
-
-    let aux_limbs = {
-        // MUL_AUX_INPUT was offset by 2^20 in generation, so we undo
-        // that here
-        let offset = P::Scalar::from_canonical_u64(AUX_COEFF_ABS_MAX as u64);
-        let mut aux_limbs = read_value::<N_LIMBS, _>(lv, MUL_AUX_INPUT_LO);
-        let aux_limbs_hi = &lv[MUL_AUX_INPUT_HI];
-        for (lo, &hi) in aux_limbs.iter_mut().zip(aux_limbs_hi) {
-            *lo += hi * base - offset;
-        }
-        aux_limbs
-    };
 
-    // Constraint poly holds the coefficients of the polynomial that
-    // must be identically zero for this multiplication to be
-    // verified.
-    //
-    // These two lines set constr_poly to the polynomial a(x)b(x) - c(x),
-    // where a, b and c are the polynomials
-    //
-    //   a(x) = \sum_i input0_limbs[i] * x^i
-    //   b(x) = \sum_i input1_limbs[i] * x^i
-    //   c(x) = \sum_i output_limbs[i] * x^i
-    //
-    // This polynomial should equal (x - β)*s(x) where s is
-    //
-    //   s(x) = \sum_i aux_limbs[i] * x^i
-    //
-    let mut constr_poly = pol_mul_lo(input0_limbs, shifted_limbs);
-    pol_sub_assign(&mut constr_poly, &output_limbs);
-
-    // This subtracts (x - β) * s(x) from constr_poly.
-    pol_sub_assign(&mut constr_poly, &pol_adjoin_root(aux_limbs, base));
-
-    // At this point constr_poly holds the coefficients of the
-    // polynomial a(x)b(x) - c(x) - (x - β)*s(x). The
-    // multiplication is valid if and only if all of those
-    // coefficients are zero.
-    for &c in &constr_poly {
-        yield_constr.constraint(is_shl * c);
-    }
+    mul::eval_packed_generic_mul(lv, is_shl, input0_limbs, shifted_limbs, yield_constr);
 }
 
 /// Evaluates the constraints for an SHR opcode.
-/// The logic is very similar to the one for DIV. The only difference is that
+/// The logic is the same as the one for DIV. The only difference is that
 /// the inputs are in `INPUT_REGISTER_1`  and `INPUT_REGISTER_2` instead of
 /// `INPUT_REGISTER_0` and `INPUT_REGISTER_1`.
 fn eval_packed_shr<P: PackedField>(
@@ -193,28 +104,17 @@ fn eval_packed_shr<P: PackedField>(
     let quo_range = OUTPUT_REGISTER;
     let rem_range = AUX_INPUT_REGISTER_0;
     let filter = lv[IS_SHR];
-    debug_assert!(quo_range.len() == N_LIMBS);
-    debug_assert!(rem_range.len() == N_LIMBS);
-
-    yield_constr.constraint_last_row(filter);
-
-    let num = &lv[INPUT_REGISTER_1];
-    let den = read_value(lv, INPUT_REGISTER_2);
-    let quo = {
-        let mut quo = [P::ZEROS; 2 * N_LIMBS];
-        quo[..N_LIMBS].copy_from_slice(&lv[quo_range]);
-        quo
-    };
-    let rem = read_value(lv, rem_range);
 
-    let mut constr_poly = modular_constr_poly(lv, nv, yield_constr, filter, rem, den, quo);
-
-    let input = num;
-    pol_sub_assign(&mut constr_poly, input);
-
-    for &c in constr_poly.iter() {
-        yield_constr.constraint_transition(filter * c);
-    }
+    divmod::eval_packed_divmod_helper(
+        lv,
+        nv,
+        yield_constr,
+        filter,
+        INPUT_REGISTER_1,
+        INPUT_REGISTER_2,
+        quo_range,
+        rem_range,
+    );
 }
 
 pub fn eval_packed_generic<P: PackedField>(
@@ -235,33 +135,14 @@ pub fn eval_ext_circuit_shl<F: RichField + Extendable<D>, const D: usize>(
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let shifted_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_2);
 
-    let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
-
-    let aux_limbs = {
-        let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
-        let offset =
-            builder.constant_extension(F::Extension::from_canonical_u64(AUX_COEFF_ABS_MAX as u64));
-        let mut aux_limbs = read_value::<N_LIMBS, _>(lv, MUL_AUX_INPUT_LO);
-        let aux_limbs_hi = &lv[MUL_AUX_INPUT_HI];
-        for (lo, &hi) in aux_limbs.iter_mut().zip(aux_limbs_hi) {
-            //*lo = lo + hi * base - offset;
-            let t = builder.mul_sub_extension(hi, base, offset);
-            *lo = builder.add_extension(*lo, t);
-        }
-        aux_limbs
-    };
-
-    let mut constr_poly = pol_mul_lo_ext_circuit(builder, input0_limbs, shifted_limbs);
-    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &output_limbs);
-
-    let base = builder.constant_extension(F::Extension::from_canonical_u64(1 << LIMB_BITS));
-    let rhs = pol_adjoin_root_ext_circuit(builder, aux_limbs, base);
-    pol_sub_assign_ext_circuit(builder, &mut constr_poly, &rhs);
-
-    for &c in &constr_poly {
-        let filter = builder.mul_extension(is_shl, c);
-        yield_constr.constraint(builder, filter);
-    }
+    mul::eval_ext_mul_circuit(
+        builder,
+        lv,
+        is_shl,
+        input0_limbs,
+        shifted_limbs,
+        yield_constr,
+    );
 }
 
 pub fn eval_ext_circuit_shr<F: RichField + Extendable<D>, const D: usize>(
@@ -271,30 +152,20 @@ pub fn eval_ext_circuit_shr<F: RichField + Extendable<D>, const D: usize>(
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
     let filter = lv[IS_SHR];
-    yield_constr.constraint_last_row(builder, filter);
-
     let quo_range = OUTPUT_REGISTER;
     let rem_range = AUX_INPUT_REGISTER_0;
-    let num = &lv[INPUT_REGISTER_1];
-    let den = read_value(lv, INPUT_REGISTER_2);
-    let quo = {
-        let zero = builder.zero_extension();
-        let mut quo = [zero; 2 * N_LIMBS];
-        quo[..N_LIMBS].copy_from_slice(&lv[quo_range]);
-        quo
-    };
-    let rem = read_value(lv, rem_range);
-
-    let mut constr_poly =
-        modular_constr_poly_ext_circuit(lv, nv, builder, yield_constr, filter, rem, den, quo);
-
-    let input = num;
-    pol_sub_assign_ext_circuit(builder, &mut constr_poly, input);
 
-    for &c in constr_poly.iter() {
-        let t = builder.mul_extension(filter, c);
-        yield_constr.constraint_transition(builder, t);
-    }
+    divmod::eval_ext_circuit_divmod_helper(
+        builder,
+        lv,
+        nv,
+        yield_constr,
+        filter,
+        INPUT_REGISTER_1,
+        INPUT_REGISTER_2,
+        quo_range,
+        rem_range,
+    );
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
@@ -322,14 +193,14 @@ mod tests {
 
     // TODO: Should be able to refactor this test to apply to all operations.
     #[test]
-    fn generate_eval_consistency_not_shl() {
+    fn generate_eval_consistency_not_shift() {
         type F = GoldilocksField;
 
         let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
         let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
         let nv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
 
-        // if `IS_SHL == 0` and ÌS_SHR == 0`, then the constraints should be met even
+        // if `IS_SHL == 0` and `IS_SHR == 0`, then the constraints should be met even
         // if all values are garbage.
         lv[IS_SHL] = F::ZERO;
         lv[IS_SHR] = F::ZERO;
@@ -346,27 +217,25 @@ mod tests {
         }
     }
 
-    #[test]
-    fn generate_eval_consistency_shl() {
+    fn generate_eval_consistency_shift(is_shl: bool) {
         type F = GoldilocksField;
 
         let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
         let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
         let mut nv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
 
-        // set `IS_MUL == 1` and ensure all constraints are satisfied.
-        lv[IS_SHL] = F::ONE;
-        lv[IS_SHR] = F::ZERO;
+        // set `IS_SHL == 1` or `IS_SHR == 1` and ensure all constraints are satisfied.
+        if is_shl {
+            lv[IS_SHL] = F::ONE;
+            lv[IS_SHR] = F::ZERO;
+        } else {
+            lv[IS_SHL] = F::ZERO;
+            lv[IS_SHR] = F::ONE;
+        }
 
         for _i in 0..N_RND_TESTS {
-            let shift = U256::from(rng.gen::<usize>());
-            let shifted = if shift > U256::from(255) {
-                U256::zero()
-            } else {
-                U256::one() << shift
-            };
-            u256_to_array(&mut lv[INPUT_REGISTER_0], shift);
-            u256_to_array(&mut lv[INPUT_REGISTER_2], shifted);
+            let shift = U256::from(rng.gen::<u8>());
+
             let mut full_input = U256::from(0);
             // set inputs to random values
             for ai in INPUT_REGISTER_1 {
@@ -375,15 +244,19 @@ mod tests {
                     U256::from(lv[ai].to_canonical_u64()) + full_input * U256::from(1 << 16);
             }
 
-            let output = full_input * shifted;
+            let output = if is_shl {
+                full_input << shift
+            } else {
+                full_input >> shift
+            };
 
-            generate(&mut lv, &mut nv, true, full_input, shift, output);
+            generate(&mut lv, &mut nv, is_shl, shift, full_input, output);
 
             let mut constraint_consumer = ConstraintConsumer::new(
                 vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
                 GoldilocksField::ONE,
                 GoldilocksField::ONE,
-                GoldilocksField::ONE,
+                GoldilocksField::ZERO,
             );
             eval_packed_generic(&lv, &nv, &mut constraint_consumer);
             for &acc in &constraint_consumer.constraint_accs {
@@ -391,4 +264,14 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn generate_eval_consistency_shl() {
+        generate_eval_consistency_shift(true);
+    }
+
+    #[test]
+    fn generate_eval_consistency_shr() {
+        generate_eval_consistency_shift(false);
+    }
 }
diff --git a/evm/src/witness/operation.rs b/evm/src/witness/operation.rs
index 7e5ed02708..568fe4b181 100644
--- a/evm/src/witness/operation.rs
+++ b/evm/src/witness/operation.rs
@@ -504,7 +504,7 @@ fn append_shift<F: Field>(
     } else {
         BinaryOperator::Shr
     };
-    let operation = arithmetic::Operation::binary(operator, input1, input0);
+    let operation = arithmetic::Operation::binary(operator, input0, input1);
 
     state.traces.push_arithmetic(operation);
     state.traces.push_memory(log_in0);

From 39dc14c859875714a4c44226a8b5c49eee50cf0f Mon Sep 17 00:00:00 2001
From: Linda Guiga <lindaguiga3@gmail.com>
Date: Wed, 4 Oct 2023 17:36:25 -0400
Subject: [PATCH 3/4] Fix SHR in the case shift >= 256

---
 evm/src/arithmetic/columns.rs |  2 +-
 evm/src/arithmetic/divmod.rs  |  8 ++++-
 evm/src/arithmetic/modular.rs | 24 ++++++++------
 evm/src/arithmetic/shift.rs   | 61 +++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index 36eb983e0b..df2d12476b 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -101,7 +101,7 @@ pub(crate) const MODULAR_OUT_AUX_RED: Range<usize> = AUX_REGISTER_0;
 pub(crate) const MODULAR_MOD_IS_ZERO: usize = AUX_REGISTER_1.start;
 pub(crate) const MODULAR_AUX_INPUT_LO: Range<usize> = AUX_REGISTER_1.start + 1..AUX_REGISTER_1.end;
 pub(crate) const MODULAR_AUX_INPUT_HI: Range<usize> = AUX_REGISTER_2;
-// Must be set to MOD_IS_ZERO for DIV operation i.e. MOD_IS_ZERO * lv[IS_DIV]
+// Must be set to MOD_IS_ZERO for DIV and SHR operations i.e. MOD_IS_ZERO * (lv[IS_DIV] + lv[IS_SHR]).
 pub(crate) const MODULAR_DIV_DENOM_IS_ZERO: usize = AUX_REGISTER_2.end;
 
 /// The counter column (used for the range check) starts from 0 and increments.
diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index bd93eea430..9f9d150d32 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -57,7 +57,7 @@ pub(crate) fn generate_divmod<F: PrimeField64>(
             );
             lv[AUX_INPUT_REGISTER_0].copy_from_slice(&quo_input[..N_LIMBS]);
         }
-        _ => panic!("expected filter to be IS_DIV or IS_MOD but it was {filter}"),
+        _ => panic!("expected filter to be IS_DIV, IS_SHR or IS_MOD but it was {filter}"),
     };
 }
 /// Generate the output and auxiliary values for modular operations.
@@ -233,6 +233,8 @@ mod tests {
         for op in MODULAR_OPS {
             lv[op] = F::ZERO;
         }
+        // Since SHR uses the logic for DIV, `IS_SHR` should also be set to 0 here.
+        lv[IS_SHR] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
@@ -264,6 +266,8 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                // Since SHR uses the logic for DIV, `IS_SHR` should also be set to 0 here.
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
@@ -324,6 +328,8 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                // Since SHR uses the logic for DIV, `IS_SHR` should also be set to 0 here.
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
diff --git a/evm/src/arithmetic/modular.rs b/evm/src/arithmetic/modular.rs
index 4e540cb6b4..4e6e21a632 100644
--- a/evm/src/arithmetic/modular.rs
+++ b/evm/src/arithmetic/modular.rs
@@ -239,7 +239,7 @@ pub(crate) fn generate_modular_op<F: PrimeField64>(
 
     let mut mod_is_zero = F::ZERO;
     if modulus.is_zero() {
-        if filter == columns::IS_DIV {
+        if filter == columns::IS_DIV || filter == columns::IS_SHR {
             // set modulus = 2^256; the condition above means we know
             // it's zero at this point, so we can just set bit 256.
             modulus.set_bit(256, true);
@@ -330,7 +330,7 @@ pub(crate) fn generate_modular_op<F: PrimeField64>(
 
     nv[MODULAR_MOD_IS_ZERO] = mod_is_zero;
     nv[MODULAR_OUT_AUX_RED].copy_from_slice(&out_aux_red.map(F::from_canonical_i64));
-    nv[MODULAR_DIV_DENOM_IS_ZERO] = mod_is_zero * lv[IS_DIV];
+    nv[MODULAR_DIV_DENOM_IS_ZERO] = mod_is_zero * (lv[IS_DIV] + lv[IS_SHR]);
 
     (
         output_limbs.map(F::from_canonical_i64),
@@ -392,14 +392,14 @@ pub(crate) fn check_reduced<P: PackedField>(
     // Verify that the output is reduced, i.e. output < modulus.
     let out_aux_red = &nv[MODULAR_OUT_AUX_RED];
     // This sets is_less_than to 1 unless we get mod_is_zero when
-    // doing a DIV; in that case, we need is_less_than=0, since
+    // doing a DIV or SHR; in that case, we need is_less_than=0, since
     // eval_packed_generic_addcy checks
     //
     //   modulus + out_aux_red == output + is_less_than*2^256
     //
     // and we are given output = out_aux_red when modulus is zero.
     let mut is_less_than = [P::ZEROS; N_LIMBS];
-    is_less_than[0] = P::ONES - mod_is_zero * lv[IS_DIV];
+    is_less_than[0] = P::ONES - mod_is_zero * (lv[IS_DIV] + lv[IS_SHR]);
     // NB: output and modulus in lv while out_aux_red and
     // is_less_than (via mod_is_zero) depend on nv, hence the
     // 'is_two_row_op' argument is set to 'true'.
@@ -448,13 +448,15 @@ pub(crate) fn modular_constr_poly<P: PackedField>(
     // modulus = 0.
     modulus[0] += mod_is_zero;
 
-    // Is 1 iff the operation is DIV and the denominator is zero.
+    // Is 1 iff the operation is DIV or SHR and the denominator is zero.
     let div_denom_is_zero = nv[MODULAR_DIV_DENOM_IS_ZERO];
-    yield_constr.constraint_transition(filter * (mod_is_zero * lv[IS_DIV] - div_denom_is_zero));
+    yield_constr.constraint_transition(
+        filter * (mod_is_zero * (lv[IS_DIV] + lv[IS_SHR]) - div_denom_is_zero),
+    );
 
     // Needed to compensate for adding mod_is_zero to modulus above,
     // since the call eval_packed_generic_addcy() below subtracts modulus
-    // to verify in the case of a DIV.
+    // to verify in the case of a DIV or SHR.
     output[0] += div_denom_is_zero;
 
     check_reduced(lv, nv, yield_constr, filter, output, modulus, mod_is_zero);
@@ -635,7 +637,8 @@ pub(crate) fn modular_constr_poly_ext_circuit<F: RichField + Extendable<D>, cons
     modulus[0] = builder.add_extension(modulus[0], mod_is_zero);
 
     let div_denom_is_zero = nv[MODULAR_DIV_DENOM_IS_ZERO];
-    let t = builder.mul_sub_extension(mod_is_zero, lv[IS_DIV], div_denom_is_zero);
+    let div_shr_filter = builder.add_extension(lv[IS_DIV], lv[IS_SHR]);
+    let t = builder.mul_sub_extension(mod_is_zero, div_shr_filter, div_denom_is_zero);
     let t = builder.mul_extension(filter, t);
     yield_constr.constraint_transition(builder, t);
     output[0] = builder.add_extension(output[0], div_denom_is_zero);
@@ -645,7 +648,7 @@ pub(crate) fn modular_constr_poly_ext_circuit<F: RichField + Extendable<D>, cons
     let zero = builder.zero_extension();
     let mut is_less_than = [zero; N_LIMBS];
     is_less_than[0] =
-        builder.arithmetic_extension(F::NEG_ONE, F::ONE, mod_is_zero, lv[IS_DIV], one);
+        builder.arithmetic_extension(F::NEG_ONE, F::ONE, mod_is_zero, div_shr_filter, one);
 
     eval_ext_circuit_addcy(
         builder,
@@ -834,6 +837,7 @@ mod tests {
         for op in MODULAR_OPS {
             lv[op] = F::ZERO;
         }
+        lv[IS_SHR] = F::ZERO;
         lv[IS_DIV] = F::ZERO;
         lv[IS_MOD] = F::ZERO;
 
@@ -867,6 +871,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[IS_DIV] = F::ZERO;
                 lv[IS_MOD] = F::ZERO;
                 lv[op_filter] = F::ONE;
@@ -926,6 +931,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[IS_DIV] = F::ZERO;
                 lv[IS_MOD] = F::ZERO;
                 lv[op_filter] = F::ONE;
diff --git a/evm/src/arithmetic/shift.rs b/evm/src/arithmetic/shift.rs
index 297100d1fb..ddca484a2a 100644
--- a/evm/src/arithmetic/shift.rs
+++ b/evm/src/arithmetic/shift.rs
@@ -229,6 +229,8 @@ mod tests {
             lv[IS_SHL] = F::ONE;
             lv[IS_SHR] = F::ZERO;
         } else {
+            // Set `IS_DIV` to 0 in this case, since we're using the logic of DIV for SHR.
+            lv[IS_DIV] = F::ZERO;
             lv[IS_SHL] = F::ZERO;
             lv[IS_SHR] = F::ONE;
         }
@@ -274,4 +276,63 @@ mod tests {
     fn generate_eval_consistency_shr() {
         generate_eval_consistency_shift(false);
     }
+
+    /// Checks that `generate` and `eval_packed_generic` agree for SHL/SHR when the
+    /// shift amount is at least 256; the expected output passed to `generate` is 0.
+    fn generate_eval_consistency_shift_over_256(is_shl: bool) {
+        type F = GoldilocksField;
+
+        let mut rng = ChaCha8Rng::seed_from_u64(0x6feb51b7ec230f25);
+        let mut lv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+        let mut nv = [F::default(); NUM_ARITH_COLUMNS].map(|_| F::sample(&mut rng));
+
+        // Set `IS_SHL == 1` or `IS_SHR == 1` and ensure all constraints are satisfied.
+        if is_shl {
+            lv[IS_SHL] = F::ONE;
+            lv[IS_SHR] = F::ZERO;
+        } else {
+            // Set `IS_DIV` to 0 in this case, since we're using the logic of DIV for SHR.
+            lv[IS_DIV] = F::ZERO;
+            lv[IS_SHL] = F::ZERO;
+            lv[IS_SHR] = F::ONE;
+        }
+
+        for _i in 0..N_RND_TESTS {
+            // A random shift of at least 256. The sample is a `usize`, so adding 256
+            // cannot overflow a `U256` and no rejection sampling is needed.
+            let shift = U256::from(rng.gen::<usize>()) + U256::from(256);
+
+            let mut full_input = U256::from(0);
+            // Set the input limbs to random 16-bit values.
+            for ai in INPUT_REGISTER_1 {
+                lv[ai] = F::from_canonical_u16(rng.gen());
+                full_input =
+                    U256::from(lv[ai].to_canonical_u64()) + full_input * U256::from(1 << 16);
+            }
+
+            let output = 0.into();
+            generate(&mut lv, &mut nv, is_shl, shift, full_input, output);
+
+            let mut constraint_consumer = ConstraintConsumer::new(
+                vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
+                GoldilocksField::ONE,
+                GoldilocksField::ONE,
+                GoldilocksField::ZERO,
+            );
+            eval_packed_generic(&lv, &nv, &mut constraint_consumer);
+            for &acc in &constraint_consumer.constraint_accs {
+                assert_eq!(acc, GoldilocksField::ZERO);
+            }
+        }
+    }
+
+    #[test]
+    fn generate_eval_consistency_shl_over_256() {
+        generate_eval_consistency_shift_over_256(true); // `is_shl = true`: SHL case.
+    }
+
+    #[test]
+    fn generate_eval_consistency_shr_over_256() {
+        generate_eval_consistency_shift_over_256(false); // `is_shl = false`: SHR case.
+    }
 }

From e8e050b6d430831f15eb5532cf1fa14982d4939c Mon Sep 17 00:00:00 2001
From: Linda Guiga <lindaguiga3@gmail.com>
Date: Thu, 5 Oct 2023 08:44:46 -0400
Subject: [PATCH 4/4] Limit visibility of helper functions

---
 evm/src/arithmetic/divmod.rs | 2 +-
 evm/src/arithmetic/mul.rs    | 6 +++---
 evm/src/arithmetic/shift.rs  | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index 9f9d150d32..e143ded6dd 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -79,7 +79,7 @@ pub(crate) fn generate<F: PrimeField64>(
 }
 
 /// Verify that num = quo * den + rem and 0 <= rem < den.
-pub fn eval_packed_divmod_helper<P: PackedField>(
+pub(crate) fn eval_packed_divmod_helper<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
     nv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index bec9a0e9ce..c09c39d8dc 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -68,7 +68,7 @@ use crate::arithmetic::utils::*;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 
 /// Given the two limbs of `left_in` and `right_in`, computes `left_in * right_in`.
-pub fn generate_mul<F: PrimeField64>(lv: &mut [F], left_in: [i64; 16], right_in: [i64; 16]) {
+pub(crate) fn generate_mul<F: PrimeField64>(lv: &mut [F], left_in: [i64; 16], right_in: [i64; 16]) {
     const MASK: i64 = (1i64 << LIMB_BITS) - 1i64;
 
     // Input and output have 16-bit limbs
@@ -120,7 +120,7 @@ pub fn generate<F: PrimeField64>(lv: &mut [F], left_in: U256, right_in: U256) {
     generate_mul(lv, input0, input1);
 }
 
-pub fn eval_packed_generic_mul<P: PackedField>(
+pub(crate) fn eval_packed_generic_mul<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
     filter: P,
     left_in_limbs: [P; 16],
@@ -184,7 +184,7 @@ pub fn eval_packed_generic<P: PackedField>(
     eval_packed_generic_mul(lv, is_mul, input0_limbs, input1_limbs, yield_constr);
 }
 
-pub fn eval_ext_mul_circuit<F: RichField + Extendable<D>, const D: usize>(
+pub(crate) fn eval_ext_mul_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     filter: ExtensionTarget<D>,
diff --git a/evm/src/arithmetic/shift.rs b/evm/src/arithmetic/shift.rs
index ddca484a2a..6600c01e54 100644
--- a/evm/src/arithmetic/shift.rs
+++ b/evm/src/arithmetic/shift.rs
@@ -81,7 +81,7 @@ pub fn generate<F: PrimeField64>(
 /// The logic is the same as the one for MUL. The only difference is that
 /// the inputs are in `INPUT_REGISTER_1`  and `INPUT_REGISTER_2` instead of
 /// `INPUT_REGISTER_0` and `INPUT_REGISTER_1`.
-pub fn eval_packed_shl<P: PackedField>(
+fn eval_packed_shl<P: PackedField>(
     lv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
@@ -126,7 +126,7 @@ pub fn eval_packed_generic<P: PackedField>(
     eval_packed_shr(lv, nv, yield_constr);
 }
 
-pub fn eval_ext_circuit_shl<F: RichField + Extendable<D>, const D: usize>(
+fn eval_ext_circuit_shl<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
@@ -145,7 +145,7 @@ pub fn eval_ext_circuit_shl<F: RichField + Extendable<D>, const D: usize>(
     );
 }
 
-pub fn eval_ext_circuit_shr<F: RichField + Extendable<D>, const D: usize>(
+fn eval_ext_circuit_shr<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],