diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml
index ba2ad1bd9b..a0ac3ec727 100644
--- a/.github/workflows/continuous-integration-workflow.yml
+++ b/.github/workflows/continuous-integration-workflow.yml
@@ -124,5 +124,5 @@ jobs:
           command: clippy
           args: --all-features --all-targets -- -D warnings -A incomplete-features
         env:
-          CARGO_INCREMENTAL: 1
-
+          # Seems necessary until https://github.com/rust-lang/rust/pull/115819 is merged.
+          CARGO_INCREMENTAL: 0
diff --git a/evm/src/arithmetic/arithmetic_stark.rs b/evm/src/arithmetic/arithmetic_stark.rs
index 4695798af5..5441cf2760 100644
--- a/evm/src/arithmetic/arithmetic_stark.rs
+++ b/evm/src/arithmetic/arithmetic_stark.rs
@@ -27,10 +27,17 @@ use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
 /// This is done by taking pairs of columns (x, y) of the arithmetic
 /// table and combining them as x + y*2^16 to ensure they equal the
 /// corresponding 32-bit number in the CPU table.
-fn cpu_arith_data_link<F: Field>(ops: &[usize], regs: &[Range<usize>]) -> Vec<Column<F>> {
+fn cpu_arith_data_link<F: Field>(
+    combined_ops: &[(usize, u8)],
+    regs: &[Range<usize>],
+) -> Vec<Column<F>> {
     let limb_base = F::from_canonical_u64(1 << columns::LIMB_BITS);
 
-    let mut res = Column::singles(ops).collect_vec();
+    let mut res = vec![Column::linear_combination(
+        combined_ops
+            .iter()
+            .map(|&(col, code)| (col, F::from_canonical_u8(code))),
+    )];
 
     // The inner for loop below assumes N_LIMBS is even.
     const_assert!(columns::N_LIMBS % 2 == 0);
@@ -49,21 +56,27 @@ fn cpu_arith_data_link<F: Field>(ops: &[usize], regs: &[Range<usize>]) -> Vec<Co
 }
 
 pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
-    const ARITH_OPS: [usize; 14] = [
-        columns::IS_ADD,
-        columns::IS_SUB,
-        columns::IS_MUL,
-        columns::IS_LT,
-        columns::IS_GT,
-        columns::IS_ADDFP254,
-        columns::IS_MULFP254,
-        columns::IS_SUBFP254,
-        columns::IS_ADDMOD,
-        columns::IS_MULMOD,
-        columns::IS_SUBMOD,
-        columns::IS_DIV,
-        columns::IS_MOD,
-        columns::IS_BYTE,
+    // We scale each filter flag with the associated opcode value.
+    // If an arithmetic operation is happening on the CPU side,
+    // the CTL will enforce that the reconstructed opcode value
+    // from the opcode bits matches.
+    const COMBINED_OPS: [(usize, u8); 16] = [
+        (columns::IS_ADD, 0x01),
+        (columns::IS_MUL, 0x02),
+        (columns::IS_SUB, 0x03),
+        (columns::IS_DIV, 0x04),
+        (columns::IS_MOD, 0x06),
+        (columns::IS_ADDMOD, 0x08),
+        (columns::IS_MULMOD, 0x09),
+        (columns::IS_ADDFP254, 0x0c),
+        (columns::IS_MULFP254, 0x0d),
+        (columns::IS_SUBFP254, 0x0e),
+        (columns::IS_SUBMOD, 0x0f),
+        (columns::IS_LT, 0x10),
+        (columns::IS_GT, 0x11),
+        (columns::IS_BYTE, 0x1a),
+        (columns::IS_SHL, 0x1b),
+        (columns::IS_SHR, 0x1c),
     ];
 
     const REGISTER_MAP: [Range<usize>; 4] = [
@@ -73,6 +86,8 @@ pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
         columns::OUTPUT_REGISTER,
     ];
 
+    let filter_column = Some(Column::sum(COMBINED_OPS.iter().map(|(c, _v)| *c)));
+
     // Create the Arithmetic Table whose columns are those of the
     // operations listed in `ops` whose inputs and outputs are given
     // by `regs`, where each element of `regs` is a range of columns
@@ -80,8 +95,8 @@ pub fn ctl_arithmetic_rows<F: Field>() -> TableWithColumns<F> {
     // is used as the operation filter).
     TableWithColumns::new(
         Table::Arithmetic,
-        cpu_arith_data_link(&ARITH_OPS, &REGISTER_MAP),
-        Some(Column::sum(ARITH_OPS)),
+        cpu_arith_data_link(&COMBINED_OPS, &REGISTER_MAP),
+        filter_column,
     )
 }
 
diff --git a/evm/src/arithmetic/columns.rs b/evm/src/arithmetic/columns.rs
index afdd583261..48e00f8e11 100644
--- a/evm/src/arithmetic/columns.rs
+++ b/evm/src/arithmetic/columns.rs
@@ -36,8 +36,10 @@ pub(crate) const IS_SUBMOD: usize = IS_SUBFP254 + 1;
 pub(crate) const IS_LT: usize = IS_SUBMOD + 1;
 pub(crate) const IS_GT: usize = IS_LT + 1;
 pub(crate) const IS_BYTE: usize = IS_GT + 1;
+pub(crate) const IS_SHL: usize = IS_BYTE + 1;
+pub(crate) const IS_SHR: usize = IS_SHL + 1;
 
-pub(crate) const START_SHARED_COLS: usize = IS_BYTE + 1;
+pub(crate) const START_SHARED_COLS: usize = IS_SHR + 1;
 
 /// Within the Arithmetic Unit, there are shared columns which can be
 /// used by any arithmetic circuit, depending on which one is active
diff --git a/evm/src/arithmetic/divmod.rs b/evm/src/arithmetic/divmod.rs
index 4f2dd748ec..258c131f32 100644
--- a/evm/src/arithmetic/divmod.rs
+++ b/evm/src/arithmetic/divmod.rs
@@ -45,7 +45,7 @@ pub(crate) fn generate<F: PrimeField64>(
     }
 
     match filter {
-        IS_DIV => {
+        IS_DIV | IS_SHR => {
             debug_assert!(
                 lv[OUTPUT_REGISTER]
                     .iter()
@@ -104,11 +104,14 @@ pub(crate) fn eval_packed<P: PackedField>(
     nv: &[P; NUM_ARITH_COLUMNS],
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
+    // Constrain IS_SHR independently, so that it doesn't impact the
+    // constraints when combining the flag with IS_DIV.
+    yield_constr.constraint_last_row(lv[IS_SHR]);
     eval_packed_divmod_helper(
         lv,
         nv,
         yield_constr,
-        lv[IS_DIV],
+        lv[IS_DIV] + lv[IS_SHR],
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -161,12 +164,14 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     nv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
+    yield_constr.constraint_last_row(builder, lv[IS_SHR]);
+    let div_shr_flag = builder.add_extension(lv[IS_DIV], lv[IS_SHR]);
     eval_ext_circuit_divmod_helper(
         builder,
         lv,
         nv,
         yield_constr,
-        lv[IS_DIV],
+        div_shr_flag,
         OUTPUT_REGISTER,
         AUX_INPUT_REGISTER_0,
     );
@@ -209,6 +214,8 @@ mod tests {
         for op in MODULAR_OPS {
             lv[op] = F::ZERO;
         }
+        // Deactivate the SHR flag so that a DIV operation is not triggered.
+        lv[IS_SHR] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
@@ -240,6 +247,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
@@ -300,6 +308,7 @@ mod tests {
                 for op in MODULAR_OPS {
                     lv[op] = F::ZERO;
                 }
+                lv[IS_SHR] = F::ZERO;
                 lv[op_filter] = F::ONE;
 
                 let input0 = U256::from(rng.gen::<[u8; 32]>());
diff --git a/evm/src/arithmetic/mod.rs b/evm/src/arithmetic/mod.rs
index d9d63a0b82..bd6d56e8cb 100644
--- a/evm/src/arithmetic/mod.rs
+++ b/evm/src/arithmetic/mod.rs
@@ -27,15 +27,17 @@ pub(crate) enum BinaryOperator {
     MulFp254,
     SubFp254,
     Byte,
+    Shl, // simulated with MUL
+    Shr, // simulated with DIV
 }
 
 impl BinaryOperator {
     pub(crate) fn result(&self, input0: U256, input1: U256) -> U256 {
         match self {
             BinaryOperator::Add => input0.overflowing_add(input1).0,
-            BinaryOperator::Mul => input0.overflowing_mul(input1).0,
+            BinaryOperator::Mul | BinaryOperator::Shl => input0.overflowing_mul(input1).0,
             BinaryOperator::Sub => input0.overflowing_sub(input1).0,
-            BinaryOperator::Div => {
+            BinaryOperator::Div | BinaryOperator::Shr => {
                 if input1.is_zero() {
                     U256::zero()
                 } else {
@@ -77,6 +79,8 @@ impl BinaryOperator {
             BinaryOperator::MulFp254 => columns::IS_MULFP254,
             BinaryOperator::SubFp254 => columns::IS_SUBFP254,
             BinaryOperator::Byte => columns::IS_BYTE,
+            BinaryOperator::Shl => columns::IS_SHL,
+            BinaryOperator::Shr => columns::IS_SHR,
         }
     }
 }
@@ -107,6 +111,7 @@ impl TernaryOperator {
     }
 }
 
+/// An enum representing arithmetic operations that can be either binary or ternary.
 #[derive(Debug)]
 pub(crate) enum Operation {
     BinaryOperation {
@@ -125,6 +130,21 @@ pub(crate) enum Operation {
 }
 
 impl Operation {
+    /// Create a binary operator with given inputs.
+    ///
+    /// NB: This works as you would expect, EXCEPT for SHL and SHR,
+    /// whose inputs need a small amount of preprocessing. Specifically,
+    /// to create `SHL(shift, value)`, call (note the reversal of
+    /// argument order):
+    ///
+    ///    `Operation::binary(BinaryOperator::Shl, value, 1 << shift)`
+    ///
+    /// Similarly, to create `SHR(shift, value)`, call
+    ///
+    ///    `Operation::binary(BinaryOperator::Shr, value, 1 << shift)`
+    ///
+    /// See witness/operation.rs::append_shift() for an example (indeed
+    /// the only call site for such inputs).
     pub(crate) fn binary(operator: BinaryOperator, input0: U256, input1: U256) -> Self {
         let result = operator.result(input0, input1);
         Self::BinaryOperation {
@@ -164,6 +184,10 @@ impl Operation {
     /// use vectors because that's what utils::transpose (who consumes
     /// the result of this function as part of the range check code)
     /// expects.
+    ///
+    /// Note that SHL and SHR are not native arithmetic operations: their rows
+    /// are produced by simulating them through MUL and DIV respectively
+    /// (see `binary_op_to_rows`).
     fn to_rows<F: PrimeField64>(&self) -> (Vec<F>, Option<Vec<F>>) {
         match *self {
             Operation::BinaryOperation {
@@ -214,11 +238,11 @@ fn binary_op_to_rows<F: PrimeField64>(
             addcy::generate(&mut row, op.row_filter(), input0, input1);
             (row, None)
         }
-        BinaryOperator::Mul => {
+        BinaryOperator::Mul | BinaryOperator::Shl => {
             mul::generate(&mut row, input0, input1);
             (row, None)
         }
-        BinaryOperator::Div | BinaryOperator::Mod => {
+        BinaryOperator::Div | BinaryOperator::Mod | BinaryOperator::Shr => {
             let mut nv = vec![F::ZERO; columns::NUM_ARITH_COLUMNS];
             divmod::generate(&mut row, &mut nv, op.row_filter(), input0, input1, result);
             (row, Some(nv))
diff --git a/evm/src/arithmetic/mul.rs b/evm/src/arithmetic/mul.rs
index 597d405192..efb4d82247 100644
--- a/evm/src/arithmetic/mul.rs
+++ b/evm/src/arithmetic/mul.rs
@@ -121,7 +121,7 @@ pub fn eval_packed_generic<P: PackedField>(
 ) {
     let base = P::Scalar::from_canonical_u64(1 << LIMB_BITS);
 
-    let is_mul = lv[IS_MUL];
+    let is_mul = lv[IS_MUL] + lv[IS_SHL];
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -173,7 +173,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &[ExtensionTarget<D>; NUM_ARITH_COLUMNS],
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let is_mul = lv[IS_MUL];
+    let is_mul = builder.add_extension(lv[IS_MUL], lv[IS_SHL]);
     let input0_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_0);
     let input1_limbs = read_value::<N_LIMBS, _>(lv, INPUT_REGISTER_1);
     let output_limbs = read_value::<N_LIMBS, _>(lv, OUTPUT_REGISTER);
@@ -229,6 +229,8 @@ mod tests {
         // if `IS_MUL == 0`, then the constraints should be met even
         // if all values are garbage.
         lv[IS_MUL] = F::ZERO;
+        // Deactivate the SHL flag so that a MUL operation is not triggered.
+        lv[IS_SHL] = F::ZERO;
 
         let mut constraint_consumer = ConstraintConsumer::new(
             vec![GoldilocksField(2), GoldilocksField(3), GoldilocksField(5)],
diff --git a/evm/src/cpu/bootstrap_kernel.rs b/evm/src/cpu/bootstrap_kernel.rs
index 66f88d3ae1..4aee617c53 100644
--- a/evm/src/cpu/bootstrap_kernel.rs
+++ b/evm/src/cpu/bootstrap_kernel.rs
@@ -25,6 +25,7 @@ pub(crate) fn generate_bootstrap_kernel<F: Field>(state: &mut GenerationState<F>
     for chunk in &KERNEL.code.iter().enumerate().chunks(NUM_GP_CHANNELS) {
         let mut cpu_row = CpuColumnsView::default();
         cpu_row.clock = F::from_canonical_usize(state.traces.clock());
+        cpu_row.is_bootstrap_kernel = F::ONE;
 
         // Write this chunk to memory, while simultaneously packing its bytes into a u32 word.
         for (channel, (addr, &byte)) in chunk.enumerate() {
@@ -39,6 +40,7 @@ pub(crate) fn generate_bootstrap_kernel<F: Field>(state: &mut GenerationState<F>
 
     let mut final_cpu_row = CpuColumnsView::default();
     final_cpu_row.clock = F::from_canonical_usize(state.traces.clock());
+    final_cpu_row.is_bootstrap_kernel = F::ONE;
     final_cpu_row.is_keccak_sponge = F::ONE;
     // The Keccak sponge CTL uses memory value columns for its inputs and outputs.
     final_cpu_row.mem_channels[0].value[0] = F::ZERO; // context
@@ -64,8 +66,8 @@ pub(crate) fn eval_bootstrap_kernel<F: Field, P: PackedField<Scalar = F>>(
     let next_values: &CpuColumnsView<_> = vars.next_values.borrow();
 
     // IS_BOOTSTRAP_KERNEL must have an init value of 1, a final value of 0, and a delta in {0, -1}.
-    let local_is_bootstrap = P::ONES - local_values.op.into_iter().sum::<P>();
-    let next_is_bootstrap = P::ONES - next_values.op.into_iter().sum::<P>();
+    let local_is_bootstrap = local_values.is_bootstrap_kernel;
+    let next_is_bootstrap = next_values.is_bootstrap_kernel;
     yield_constr.constraint_first_row(local_is_bootstrap - P::ONES);
     yield_constr.constraint_last_row(local_is_bootstrap);
     let delta_is_bootstrap = next_is_bootstrap - local_is_bootstrap;
@@ -111,10 +113,8 @@ pub(crate) fn eval_bootstrap_kernel_circuit<F: RichField + Extendable<D>, const
     let one = builder.one_extension();
 
     // IS_BOOTSTRAP_KERNEL must have an init value of 1, a final value of 0, and a delta in {0, -1}.
-    let local_is_bootstrap = builder.add_many_extension(local_values.op.iter());
-    let local_is_bootstrap = builder.sub_extension(one, local_is_bootstrap);
-    let next_is_bootstrap = builder.add_many_extension(next_values.op.iter());
-    let next_is_bootstrap = builder.sub_extension(one, next_is_bootstrap);
+    let local_is_bootstrap = local_values.is_bootstrap_kernel;
+    let next_is_bootstrap = next_values.is_bootstrap_kernel;
     let constraint = builder.sub_extension(local_is_bootstrap, one);
     yield_constr.constraint_first_row(builder, constraint);
     yield_constr.constraint_last_row(builder, local_is_bootstrap);
diff --git a/evm/src/cpu/columns/mod.rs b/evm/src/cpu/columns/mod.rs
index 134ab02b49..fecc8df986 100644
--- a/evm/src/cpu/columns/mod.rs
+++ b/evm/src/cpu/columns/mod.rs
@@ -35,6 +35,9 @@ pub struct MemoryChannelView<T: Copy> {
 #[repr(C)]
 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
 pub struct CpuColumnsView<T: Copy> {
+    /// Filter. 1 if the row is part of bootstrapping the kernel code, 0 otherwise.
+    pub is_bootstrap_kernel: T,
+
     /// If CPU cycle: Current context.
     // TODO: this is currently unconstrained
     pub context: T,
diff --git a/evm/src/cpu/columns/ops.rs b/evm/src/cpu/columns/ops.rs
index 6c68a18305..d4d753f7cf 100644
--- a/evm/src/cpu/columns/ops.rs
+++ b/evm/src/cpu/columns/ops.rs
@@ -7,33 +7,17 @@ use crate::util::{indices_arr, transmute_no_compile_time_size_checks};
 #[repr(C)]
 #[derive(Clone, Copy, Eq, PartialEq, Debug)]
 pub struct OpsColumnsView<T: Copy> {
-    // TODO: combine ADD, MUL, SUB, DIV, MOD, ADDFP254, MULFP254, SUBFP254, LT, and GT into one flag
-    pub add: T,
-    pub mul: T,
-    pub sub: T,
-    pub div: T,
-    pub mod_: T,
-    // TODO: combine ADDMOD, MULMOD and SUBMOD into one flag
-    pub addmod: T,
-    pub mulmod: T,
-    pub addfp254: T,
-    pub mulfp254: T,
-    pub subfp254: T,
-    pub submod: T,
-    pub lt: T,
-    pub gt: T,
-    pub eq_iszero: T, // Combines EQ and ISZERO flags.
-    pub logic_op: T,  // Combines AND, OR and XOR flags.
+    pub binary_op: T,  // Combines ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags.
+    pub ternary_op: T, // Combines ADDMOD, MULMOD and SUBMOD flags.
+    pub fp254_op: T,   // Combines ADD_FP254, MUL_FP254 and SUB_FP254 flags.
+    pub eq_iszero: T,  // Combines EQ and ISZERO flags.
+    pub logic_op: T,   // Combines AND, OR and XOR flags.
     pub not: T,
-    pub byte: T,
-    // TODO: combine SHL and SHR into one flag
-    pub shl: T,
-    pub shr: T,
+    pub shift: T, // Combines SHL and SHR flags.
     pub keccak_general: T,
     pub prover_input: T,
     pub pop: T,
-    // TODO: combine JUMP and JUMPI into one flag
-    pub jumps: T, // Note: This column must be 0 when is_cpu_cycle = 0.
+    pub jumps: T, // Combines JUMP and JUMPI flags.
     pub pc: T,
     pub jumpdest: T,
     pub push0: T,
@@ -44,9 +28,7 @@ pub struct OpsColumnsView<T: Copy> {
     pub mstore_32bytes: T,
     pub mload_32bytes: T,
     pub exit_kernel: T,
-    // TODO: combine MLOAD_GENERAL and MSTORE_GENERAL into one flag
-    pub mload_general: T,
-    pub mstore_general: T,
+    pub m_op_general: T,
 
     pub syscall: T,
     pub exception: T,
diff --git a/evm/src/cpu/control_flow.rs b/evm/src/cpu/control_flow.rs
index 0bea5c7c70..9c17367aa2 100644
--- a/evm/src/cpu/control_flow.rs
+++ b/evm/src/cpu/control_flow.rs
@@ -8,24 +8,14 @@ use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer
 use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 use crate::cpu::kernel::aggregator::KERNEL;
 
-const NATIVE_INSTRUCTIONS: [usize; 28] = [
-    COL_MAP.op.add,
-    COL_MAP.op.mul,
-    COL_MAP.op.sub,
-    COL_MAP.op.div,
-    COL_MAP.op.mod_,
-    COL_MAP.op.addmod,
-    COL_MAP.op.mulmod,
-    COL_MAP.op.addfp254,
-    COL_MAP.op.mulfp254,
-    COL_MAP.op.subfp254,
-    COL_MAP.op.lt,
-    COL_MAP.op.gt,
+const NATIVE_INSTRUCTIONS: [usize; 17] = [
+    COL_MAP.op.binary_op,
+    COL_MAP.op.ternary_op,
+    COL_MAP.op.fp254_op,
     COL_MAP.op.eq_iszero,
     COL_MAP.op.logic_op,
     COL_MAP.op.not,
-    COL_MAP.op.shl,
-    COL_MAP.op.shr,
+    COL_MAP.op.shift,
     COL_MAP.op.keccak_general,
     COL_MAP.op.prover_input,
     COL_MAP.op.pop,
@@ -39,20 +29,14 @@ const NATIVE_INSTRUCTIONS: [usize; 28] = [
     COL_MAP.op.swap,
     COL_MAP.op.context_op,
     // not EXIT_KERNEL (performs a jump)
-    COL_MAP.op.mload_general,
-    COL_MAP.op.mstore_general,
+    COL_MAP.op.m_op_general,
     // not SYSCALL (performs a jump)
     // not exceptions (also jump)
 ];
 
-pub(crate) fn get_halt_pcs<F: Field>() -> (F, F) {
-    let halt_pc0 = KERNEL.global_labels["halt_pc0"];
-    let halt_pc1 = KERNEL.global_labels["halt_pc1"];
-
-    (
-        F::from_canonical_usize(halt_pc0),
-        F::from_canonical_usize(halt_pc1),
-    )
+pub(crate) fn get_halt_pc<F: Field>() -> F {
+    let halt_pc = KERNEL.global_labels["halt"];
+    F::from_canonical_usize(halt_pc)
 }
 
 pub(crate) fn get_start_pc<F: Field>() -> F {
@@ -68,8 +52,15 @@ pub fn eval_packed_generic<P: PackedField>(
 ) {
     let is_cpu_cycle: P = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum();
     let is_cpu_cycle_next: P = COL_MAP.op.iter().map(|&col_i| nv[col_i]).sum();
-    // Once we start executing instructions, then we continue until the end of the table.
-    yield_constr.constraint_transition(is_cpu_cycle * (is_cpu_cycle_next - P::ONES));
+
+    let next_halt_state = P::ONES - nv.is_bootstrap_kernel - is_cpu_cycle_next;
+
+    // Once we start executing instructions, then we continue until the end of the table
+    // or we reach dummy padding rows. This, along with the constraints on the first row,
+    // enforces that operation flags and the halt flag are mutually exclusive over the entire
+    // CPU trace.
+    yield_constr
+        .constraint_transition(is_cpu_cycle * (is_cpu_cycle_next + next_halt_state - P::ONES));
 
     // If a row is a CPU cycle and executing a native instruction (implemented as a table row; not
     // microcoded) then the program counter is incremented by 1 to obtain the next row's program
@@ -90,16 +81,6 @@ pub fn eval_packed_generic<P: PackedField>(
     yield_constr.constraint_transition(is_last_noncpu_cycle * pc_diff);
     yield_constr.constraint_transition(is_last_noncpu_cycle * (nv.is_kernel_mode - P::ONES));
     yield_constr.constraint_transition(is_last_noncpu_cycle * nv.stack_len);
-
-    // The last row must be a CPU cycle row.
-    yield_constr.constraint_last_row(is_cpu_cycle - P::ONES);
-    // Also, the last row's `program_counter` must be inside the `halt` infinite loop. Note that
-    // that loop consists of two instructions, so we must check for `halt` and `halt_inner` labels.
-    let (halt_pc0, halt_pc1) = get_halt_pcs::<P::Scalar>();
-    yield_constr
-        .constraint_last_row((lv.program_counter - halt_pc0) * (lv.program_counter - halt_pc1));
-    // Finally, the last row must be in kernel mode.
-    yield_constr.constraint_last_row(lv.is_kernel_mode - P::ONES);
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
@@ -108,11 +89,21 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
+    let one = builder.one_extension();
+
     let is_cpu_cycle = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| lv[col_i]));
     let is_cpu_cycle_next = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| nv[col_i]));
-    // Once we start executing instructions, then we continue until the end of the table.
+
+    let next_halt_state = builder.add_extension(nv.is_bootstrap_kernel, is_cpu_cycle_next);
+    let next_halt_state = builder.sub_extension(one, next_halt_state);
+
+    // Once we start executing instructions, then we continue until the end of the table
+    // or we reach dummy padding rows. This, along with the constraints on the first row,
+    // enforces that operation flags and the halt flag are mutually exclusive over the entire
+    // CPU trace.
     {
-        let constr = builder.mul_sub_extension(is_cpu_cycle, is_cpu_cycle_next, is_cpu_cycle);
+        let constr = builder.add_extension(is_cpu_cycle_next, next_halt_state);
+        let constr = builder.mul_sub_extension(is_cpu_cycle, constr, is_cpu_cycle);
         yield_constr.constraint_transition(builder, constr);
     }
 
@@ -155,30 +146,4 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         let kernel_constr = builder.mul_extension(is_last_noncpu_cycle, nv.stack_len);
         yield_constr.constraint_transition(builder, kernel_constr);
     }
-
-    // The last row must be a CPU cycle row.
-    {
-        let one = builder.one_extension();
-        let constr = builder.sub_extension(is_cpu_cycle, one);
-        yield_constr.constraint_last_row(builder, constr);
-    }
-    // Also, the last row's `program_counter` must be inside the `halt` infinite loop. Note that
-    // that loop consists of two instructions, so we must check for `halt` and `halt_inner` labels.
-    {
-        let (halt_pc0, halt_pc1) = get_halt_pcs();
-        let halt_pc0_target = builder.constant_extension(halt_pc0);
-        let halt_pc1_target = builder.constant_extension(halt_pc1);
-
-        let halt_pc0_offset = builder.sub_extension(lv.program_counter, halt_pc0_target);
-        let halt_pc1_offset = builder.sub_extension(lv.program_counter, halt_pc1_target);
-        let constr = builder.mul_extension(halt_pc0_offset, halt_pc1_offset);
-
-        yield_constr.constraint_last_row(builder, constr);
-    }
-    // Finally, the last row must be in kernel mode.
-    {
-        let one = builder.one_extension();
-        let constr = builder.sub_extension(lv.is_kernel_mode, one);
-        yield_constr.constraint_last_row(builder, constr);
-    }
 }
diff --git a/evm/src/cpu/cpu_stark.rs b/evm/src/cpu/cpu_stark.rs
index 25e7cc6ba0..bd2fcf193b 100644
--- a/evm/src/cpu/cpu_stark.rs
+++ b/evm/src/cpu/cpu_stark.rs
@@ -8,6 +8,7 @@ use plonky2::field::packed::PackedField;
 use plonky2::field::types::Field;
 use plonky2::hash::hash_types::RichField;
 
+use super::halt;
 use crate::all_stark::Table;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::cpu::columns::{CpuColumnsView, COL_MAP, NUM_CPU_COLUMNS};
@@ -48,9 +49,8 @@ pub fn ctl_filter_keccak_sponge<F: Field>() -> Column<F> {
 
 /// Create the vector of Columns corresponding to the two inputs and
 /// one output of a binary operation.
-fn ctl_data_binops<F: Field>(ops: &[usize]) -> Vec<Column<F>> {
-    let mut res = Column::singles(ops).collect_vec();
-    res.extend(Column::singles(COL_MAP.mem_channels[0].value));
+fn ctl_data_binops<F: Field>() -> Vec<Column<F>> {
+    let mut res = Column::singles(COL_MAP.mem_channels[0].value).collect_vec();
     res.extend(Column::singles(COL_MAP.mem_channels[1].value));
     res.extend(Column::singles(
         COL_MAP.mem_channels[NUM_GP_CHANNELS - 1].value,
@@ -70,10 +70,9 @@ fn ctl_data_binops<F: Field>(ops: &[usize]) -> Vec<Column<F>> {
 /// case of shift operations, which will skip the first memory channel and use the
 /// next three as ternary inputs. Because both `MUL` and `DIV` are binary operations,
 /// the last memory channel used for the inputs will be safely ignored.
-fn ctl_data_ternops<F: Field>(ops: &[usize], is_shift: bool) -> Vec<Column<F>> {
+fn ctl_data_ternops<F: Field>(is_shift: bool) -> Vec<Column<F>> {
     let offset = is_shift as usize;
-    let mut res = Column::singles(ops).collect_vec();
-    res.extend(Column::singles(COL_MAP.mem_channels[offset].value));
+    let mut res = Column::singles(COL_MAP.mem_channels[offset].value).collect_vec();
     res.extend(Column::singles(COL_MAP.mem_channels[offset + 1].value));
     res.extend(Column::singles(COL_MAP.mem_channels[offset + 2].value));
     res.extend(Column::singles(
@@ -85,7 +84,7 @@ fn ctl_data_ternops<F: Field>(ops: &[usize], is_shift: bool) -> Vec<Column<F>> {
 pub fn ctl_data_logic<F: Field>() -> Vec<Column<F>> {
     // Instead of taking single columns, we reconstruct the entire opcode value directly.
     let mut res = vec![Column::le_bits(COL_MAP.opcode_bits)];
-    res.extend(ctl_data_binops(&[]));
+    res.extend(ctl_data_binops());
     res
 }
 
@@ -94,22 +93,9 @@ pub fn ctl_filter_logic<F: Field>() -> Column<F> {
 }
 
 pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
-    const OPS: [usize; 14] = [
-        COL_MAP.op.add,
-        COL_MAP.op.sub,
-        COL_MAP.op.mul,
-        COL_MAP.op.lt,
-        COL_MAP.op.gt,
-        COL_MAP.op.addfp254,
-        COL_MAP.op.mulfp254,
-        COL_MAP.op.subfp254,
-        COL_MAP.op.addmod,
-        COL_MAP.op.mulmod,
-        COL_MAP.op.submod,
-        COL_MAP.op.div,
-        COL_MAP.op.mod_,
-        COL_MAP.op.byte,
-    ];
+    // Instead of taking single columns, we reconstruct the entire opcode value directly.
+    let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
+    columns.extend(ctl_data_ternops(false));
     // Create the CPU Table whose columns are those with the three
     // inputs and one output of the ternary operations listed in `ops`
     // (also `ops` is used as the operation filter). The list of
@@ -117,40 +103,25 @@ pub fn ctl_arithmetic_base_rows<F: Field>() -> TableWithColumns<F> {
     // the third input.
     TableWithColumns::new(
         Table::Cpu,
-        ctl_data_ternops(&OPS, false),
-        Some(Column::sum(OPS)),
+        columns,
+        Some(Column::sum([
+            COL_MAP.op.binary_op,
+            COL_MAP.op.fp254_op,
+            COL_MAP.op.ternary_op,
+        ])),
     )
 }
 
 pub fn ctl_arithmetic_shift_rows<F: Field>() -> TableWithColumns<F> {
-    const OPS: [usize; 14] = [
-        COL_MAP.op.add,
-        COL_MAP.op.sub,
-        // SHL is interpreted as MUL on the arithmetic side
-        COL_MAP.op.shl,
-        COL_MAP.op.lt,
-        COL_MAP.op.gt,
-        COL_MAP.op.addfp254,
-        COL_MAP.op.mulfp254,
-        COL_MAP.op.subfp254,
-        COL_MAP.op.addmod,
-        COL_MAP.op.mulmod,
-        COL_MAP.op.submod,
-        // SHR is interpreted as DIV on the arithmetic side
-        COL_MAP.op.shr,
-        COL_MAP.op.mod_,
-        COL_MAP.op.byte,
-    ];
+    // Instead of taking single columns, we reconstruct the entire opcode value directly.
+    let mut columns = vec![Column::le_bits(COL_MAP.opcode_bits)];
+    columns.extend(ctl_data_ternops(true));
     // Create the CPU Table whose columns are those with the three
     // inputs and one output of the ternary operations listed in `ops`
     // (also `ops` is used as the operation filter). The list of
     // operations includes binary operations which will simply ignore
     // the third input.
-    TableWithColumns::new(
-        Table::Cpu,
-        ctl_data_ternops(&OPS, true),
-        Some(Column::sum([COL_MAP.op.shl, COL_MAP.op.shr])),
-    )
+    TableWithColumns::new(Table::Cpu, columns, Some(Column::single(COL_MAP.op.shift)))
 }
 
 pub fn ctl_data_byte_packing<F: Field>() -> Vec<Column<F>> {
@@ -274,15 +245,16 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for CpuStark<F, D
         decode::eval_packed_generic(local_values, yield_constr);
         dup_swap::eval_packed(local_values, yield_constr);
         gas::eval_packed(local_values, next_values, yield_constr);
+        halt::eval_packed(local_values, next_values, yield_constr);
         jumps::eval_packed(local_values, next_values, yield_constr);
         membus::eval_packed(local_values, yield_constr);
-        memio::eval_packed(local_values, yield_constr);
+        memio::eval_packed(local_values, next_values, yield_constr);
         modfp254::eval_packed(local_values, yield_constr);
         pc::eval_packed(local_values, yield_constr);
         push0::eval_packed(local_values, yield_constr);
         shift::eval_packed(local_values, yield_constr);
-        simple_logic::eval_packed(local_values, yield_constr);
-        stack::eval_packed(local_values, yield_constr);
+        simple_logic::eval_packed(local_values, next_values, yield_constr);
+        stack::eval_packed(local_values, next_values, yield_constr);
         stack_bounds::eval_packed(local_values, yield_constr);
         syscalls_exceptions::eval_packed(local_values, next_values, yield_constr);
     }
@@ -301,15 +273,16 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for CpuStark<F, D
         decode::eval_ext_circuit(builder, local_values, yield_constr);
         dup_swap::eval_ext_circuit(builder, local_values, yield_constr);
         gas::eval_ext_circuit(builder, local_values, next_values, yield_constr);
+        halt::eval_ext_circuit(builder, local_values, next_values, yield_constr);
         jumps::eval_ext_circuit(builder, local_values, next_values, yield_constr);
         membus::eval_ext_circuit(builder, local_values, yield_constr);
-        memio::eval_ext_circuit(builder, local_values, yield_constr);
+        memio::eval_ext_circuit(builder, local_values, next_values, yield_constr);
         modfp254::eval_ext_circuit(builder, local_values, yield_constr);
         pc::eval_ext_circuit(builder, local_values, yield_constr);
         push0::eval_ext_circuit(builder, local_values, yield_constr);
         shift::eval_ext_circuit(builder, local_values, yield_constr);
-        simple_logic::eval_ext_circuit(builder, local_values, yield_constr);
-        stack::eval_ext_circuit(builder, local_values, yield_constr);
+        simple_logic::eval_ext_circuit(builder, local_values, next_values, yield_constr);
+        stack::eval_ext_circuit(builder, local_values, next_values, yield_constr);
         stack_bounds::eval_ext_circuit(builder, local_values, yield_constr);
         syscalls_exceptions::eval_ext_circuit(builder, local_values, next_values, yield_constr);
     }
diff --git a/evm/src/cpu/decode.rs b/evm/src/cpu/decode.rs
index 9a9c572387..c1c43a0bb1 100644
--- a/evm/src/cpu/decode.rs
+++ b/evm/src/cpu/decode.rs
@@ -1,5 +1,6 @@
 use plonky2::field::extension::Extendable;
 use plonky2::field::packed::PackedField;
+use plonky2::field::types::Field;
 use plonky2::hash::hash_types::RichField;
 use plonky2::iop::ext_target::ExtensionTarget;
 
@@ -22,26 +23,15 @@ use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 /// behavior.
 /// Note: invalid opcodes are not represented here. _Any_ opcode is permitted to decode to
 /// `is_invalid`. The kernel then verifies that the opcode was _actually_ invalid.
-const OPCODES: [(u8, usize, bool, usize); 33] = [
+const OPCODES: [(u8, usize, bool, usize); 16] = [
     // (start index of block, number of top bits to check (log2), kernel-only, flag column)
-    (0x01, 0, false, COL_MAP.op.add),
-    (0x02, 0, false, COL_MAP.op.mul),
-    (0x03, 0, false, COL_MAP.op.sub),
-    (0x04, 0, false, COL_MAP.op.div),
-    (0x06, 0, false, COL_MAP.op.mod_),
-    (0x08, 0, false, COL_MAP.op.addmod),
-    (0x09, 0, false, COL_MAP.op.mulmod),
-    (0x0c, 0, true, COL_MAP.op.addfp254),
-    (0x0d, 0, true, COL_MAP.op.mulfp254),
-    (0x0e, 0, true, COL_MAP.op.subfp254),
-    (0x10, 0, false, COL_MAP.op.lt),
-    (0x11, 0, false, COL_MAP.op.gt),
+    // ADD, MUL, SUB, DIV, MOD, LT, GT and BYTE flags are handled partly manually here, and partly through the Arithmetic table CTL.
+    // ADDMOD, MULMOD and SUBMOD flags are handled partly manually here, and partly through the Arithmetic table CTL.
+    // FP254 operation flags are handled partly manually here, and partly through the Arithmetic table CTL.
     (0x14, 1, false, COL_MAP.op.eq_iszero),
     // AND, OR and XOR flags are handled partly manually here, and partly through the Logic table CTL.
     (0x19, 0, false, COL_MAP.op.not),
-    (0x1a, 0, false, COL_MAP.op.byte),
-    (0x1b, 0, false, COL_MAP.op.shl),
-    (0x1c, 0, false, COL_MAP.op.shr),
+    // SHL and SHR flags are handled partly manually here, and partly through the Logic table CTL.
     (0x21, 0, true, COL_MAP.op.keccak_general),
     (0x49, 0, true, COL_MAP.op.prover_input),
     (0x50, 0, false, COL_MAP.op.pop),
@@ -56,8 +46,19 @@ const OPCODES: [(u8, usize, bool, usize); 33] = [
     (0xf6, 1, true, COL_MAP.op.context_op), // 0xf6-0xf7
     (0xf8, 0, true, COL_MAP.op.mload_32bytes),
     (0xf9, 0, true, COL_MAP.op.exit_kernel),
-    (0xfb, 0, true, COL_MAP.op.mload_general),
-    (0xfc, 0, true, COL_MAP.op.mstore_general),
+    // MLOAD_GENERAL and MSTORE_GENERAL flags are handled manually here.
+];
+
+/// List of combined opcode flags requiring special handling.
+/// Each index in the list corresponds to an arbitrary combination
+/// of opcodes defined in evm/src/cpu/columns/ops.rs.
+const COMBINED_OPCODES: [usize; 6] = [
+    COL_MAP.op.logic_op,
+    COL_MAP.op.fp254_op,
+    COL_MAP.op.binary_op,
+    COL_MAP.op.ternary_op,
+    COL_MAP.op.shift,
+    COL_MAP.op.m_op_general,
 ];
 
 pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
@@ -99,6 +100,10 @@ pub fn generate<F: RichField>(lv: &mut CpuColumnsView<F>) {
         let flag = available && opcode_match;
         lv[col] = F::from_bool(flag);
     }
+
+    if opcode == 0xfb || opcode == 0xfc {
+        lv.op.m_op_general = F::from_bool(kernel);
+    }
 }
 
 /// Break up an opcode (which is 8 bits long) into its eight bits.
@@ -134,17 +139,17 @@ pub fn eval_packed_generic<P: PackedField>(
         let flag = lv[flag_col];
         yield_constr.constraint(flag * (flag - P::ONES));
     }
-    // Manually check the logic_op flag combining AND, OR and XOR.
-    let flag = lv.op.logic_op;
-    yield_constr.constraint(flag * (flag - P::ONES));
+    // Also check that the combined instruction flags are valid.
+    for flag_idx in COMBINED_OPCODES {
+        yield_constr.constraint(lv[flag_idx] * (lv[flag_idx] - P::ONES));
+    }
 
-    // Now check that they sum to 0 or 1.
-    // Includes the logic_op flag encompassing AND, OR and XOR opcodes.
+    // Now check that they sum to 0 or 1, including the combined flags.
     let flag_sum: P = OPCODES
         .into_iter()
         .map(|(_, _, _, flag_col)| lv[flag_col])
-        .sum::<P>()
-        + lv.op.logic_op;
+        .chain(COMBINED_OPCODES.map(|op| lv[op]))
+        .sum::<P>();
     yield_constr.constraint(flag_sum * (flag_sum - P::ONES));
 
     // Finally, classify all opcodes, together with the kernel flag, into blocks
@@ -173,6 +178,20 @@ pub fn eval_packed_generic<P: PackedField>(
         // correct mode.
         yield_constr.constraint(lv[col] * (unavailable + opcode_mismatch));
     }
+
+    // Manually check lv.op.m_op_general: it is kernel-only and the opcode must be 0xfb or 0xfc.
+    let opcode: P = lv
+        .opcode_bits
+        .into_iter()
+        .enumerate()
+        .map(|(i, bit)| bit * P::Scalar::from_canonical_u64(1 << i))
+        .sum();
+    yield_constr.constraint((P::ONES - kernel_mode) * lv.op.m_op_general);
+
+    let m_op_constr = (opcode - P::Scalar::from_canonical_usize(0xfb_usize))
+        * (opcode - P::Scalar::from_canonical_usize(0xfc_usize))
+        * lv.op.m_op_general;
+    yield_constr.constraint(m_op_constr);
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
@@ -204,15 +223,16 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         let constr = builder.mul_sub_extension(flag, flag, flag);
         yield_constr.constraint(builder, constr);
     }
-    // Manually check the logic_op flag combining AND, OR and XOR.
-    let flag = lv.op.logic_op;
-    let constr = builder.mul_sub_extension(flag, flag, flag);
-    yield_constr.constraint(builder, constr);
+    // Also check that the combined instruction flags are valid.
+    for flag_idx in COMBINED_OPCODES {
+        let constr = builder.mul_sub_extension(lv[flag_idx], lv[flag_idx], lv[flag_idx]);
+        yield_constr.constraint(builder, constr);
+    }
 
-    // Now check that they sum to 0 or 1.
-    // Includes the logic_op flag encompassing AND, OR and XOR opcodes.
+    // Now check that they sum to 0 or 1, including the combined flags.
     {
-        let mut flag_sum = lv.op.logic_op;
+        let mut flag_sum =
+            builder.add_many_extension(COMBINED_OPCODES.into_iter().map(|idx| lv[idx]));
         for (_, _, _, flag_col) in OPCODES {
             let flag = lv[flag_col];
             flag_sum = builder.add_extension(flag_sum, flag);
@@ -250,4 +270,28 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
         let constr = builder.mul_extension(lv[col], constr);
         yield_constr.constraint(builder, constr);
     }
+
+    // Manually check lv.op.m_op_general: it is kernel-only and the opcode must be 0xfb or 0xfc.
+    let opcode = lv
+        .opcode_bits
+        .into_iter()
+        .rev()
+        .fold(builder.zero_extension(), |cumul, bit| {
+            builder.mul_const_add_extension(F::TWO, cumul, bit)
+        });
+
+    let mload_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0xfb_usize));
+    let mstore_opcode = builder.constant_extension(F::Extension::from_canonical_usize(0xfc_usize));
+
+    let one_extension = builder.constant_extension(F::Extension::ONE);
+    let is_not_kernel_mode = builder.sub_extension(one_extension, kernel_mode);
+    let constr = builder.mul_extension(is_not_kernel_mode, lv.op.m_op_general);
+    yield_constr.constraint(builder, constr);
+
+    let mload_constr = builder.sub_extension(opcode, mload_opcode);
+    let mstore_constr = builder.sub_extension(opcode, mstore_opcode);
+    let mut m_op_constr = builder.mul_extension(mload_constr, mstore_constr);
+    m_op_constr = builder.mul_extension(m_op_constr, lv.op.m_op_general);
+
+    yield_constr.constraint(builder, m_op_constr);
 }
diff --git a/evm/src/cpu/gas.rs b/evm/src/cpu/gas.rs
index e967c07ece..51f375c056 100644
--- a/evm/src/cpu/gas.rs
+++ b/evm/src/cpu/gas.rs
@@ -19,25 +19,13 @@ const G_MID: Option<u32> = Some(8);
 const G_HIGH: Option<u32> = Some(10);
 
 const SIMPLE_OPCODES: OpsColumnsView<Option<u32>> = OpsColumnsView {
-    add: G_VERYLOW,
-    mul: G_LOW,
-    sub: G_VERYLOW,
-    div: G_LOW,
-    mod_: G_LOW,
-    addmod: G_MID,
-    mulmod: G_MID,
-    addfp254: KERNEL_ONLY_INSTR,
-    mulfp254: KERNEL_ONLY_INSTR,
-    subfp254: KERNEL_ONLY_INSTR,
-    submod: KERNEL_ONLY_INSTR,
-    lt: G_VERYLOW,
-    gt: G_VERYLOW,
+    binary_op: None,  // This is handled manually below
+    ternary_op: None, // This is handled manually below
+    fp254_op: KERNEL_ONLY_INSTR,
     eq_iszero: G_VERYLOW,
     logic_op: G_VERYLOW,
     not: G_VERYLOW,
-    byte: G_VERYLOW,
-    shl: G_VERYLOW,
-    shr: G_VERYLOW,
+    shift: G_VERYLOW,
     keccak_general: KERNEL_ONLY_INSTR,
     prover_input: KERNEL_ONLY_INSTR,
     pop: G_BASE,
@@ -52,8 +40,7 @@ const SIMPLE_OPCODES: OpsColumnsView<Option<u32>> = OpsColumnsView {
     mstore_32bytes: KERNEL_ONLY_INSTR,
     mload_32bytes: KERNEL_ONLY_INSTR,
     exit_kernel: None,
-    mload_general: KERNEL_ONLY_INSTR,
-    mstore_general: KERNEL_ONLY_INSTR,
+    m_op_general: KERNEL_ONLY_INSTR,
     syscall: None,
     exception: None,
 };
@@ -97,6 +84,21 @@ fn eval_packed_accumulate<P: PackedField>(
     let jump_gas_cost = P::Scalar::from_canonical_u32(G_MID.unwrap())
         + lv.opcode_bits[0] * P::Scalar::from_canonical_u32(G_HIGH.unwrap() - G_MID.unwrap());
     yield_constr.constraint_transition(lv.op.jumps * (nv.gas - lv.gas - jump_gas_cost));
+
+    // For binary_ops.
+    // MUL, DIV and MOD (cost G_LOW) are differentiated from ADD, SUB, LT, GT and BYTE (cost G_VERYLOW) by having both their first and fifth bits (opcode_bits[0] and opcode_bits[4]) set to 0.
+    let cost_filter = lv.opcode_bits[0] + lv.opcode_bits[4] - lv.opcode_bits[0] * lv.opcode_bits[4];
+    let binary_op_cost = P::Scalar::from_canonical_u32(G_LOW.unwrap())
+        + cost_filter
+            * (P::Scalar::from_canonical_u32(G_VERYLOW.unwrap())
+                - P::Scalar::from_canonical_u32(G_LOW.unwrap()));
+    yield_constr.constraint_transition(lv.op.binary_op * (nv.gas - lv.gas - binary_op_cost));
+
+    // For ternary_ops.
+    // SUBMOD is differentiated by its second bit (opcode_bits[1]) set to 1, in which case the gas cost below evaluates to 0.
+    let ternary_op_cost = P::Scalar::from_canonical_u32(G_MID.unwrap())
+        - lv.opcode_bits[1] * P::Scalar::from_canonical_u32(G_MID.unwrap());
+    yield_constr.constraint_transition(lv.op.ternary_op * (nv.gas - lv.gas - ternary_op_cost));
 }
 
 fn eval_packed_init<P: PackedField>(
@@ -186,6 +188,41 @@ fn eval_ext_circuit_accumulate<F: RichField + Extendable<D>, const D: usize>(
     let gas_diff = builder.sub_extension(nv_lv_diff, jump_gas_cost);
     let constr = builder.mul_extension(filter, gas_diff);
     yield_constr.constraint_transition(builder, constr);
+
+    // For binary_ops.
+    // MUL, DIV and MOD (cost G_LOW) are differentiated from ADD, SUB, LT, GT and BYTE (cost G_VERYLOW) by having both their first and fifth bits (opcode_bits[0] and opcode_bits[4]) set to 0.
+    let filter = lv.op.binary_op;
+    let cost_filter = {
+        let a = builder.add_extension(lv.opcode_bits[0], lv.opcode_bits[4]);
+        let b = builder.mul_extension(lv.opcode_bits[0], lv.opcode_bits[4]);
+        builder.sub_extension(a, b)
+    };
+    let binary_op_cost = builder.mul_const_extension(
+        F::from_canonical_u32(G_VERYLOW.unwrap()) - F::from_canonical_u32(G_LOW.unwrap()),
+        cost_filter,
+    );
+    let binary_op_cost =
+        builder.add_const_extension(binary_op_cost, F::from_canonical_u32(G_LOW.unwrap()));
+
+    let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas);
+    let gas_diff = builder.sub_extension(nv_lv_diff, binary_op_cost);
+    let constr = builder.mul_extension(filter, gas_diff);
+    yield_constr.constraint_transition(builder, constr);
+
+    // For ternary_ops.
+    // SUBMOD is differentiated by its second bit (opcode_bits[1]) set to 1, in which case the gas cost below evaluates to 0.
+    let filter = lv.op.ternary_op;
+    let ternary_op_cost = builder.mul_const_extension(
+        F::from_canonical_u32(G_MID.unwrap()).neg(),
+        lv.opcode_bits[1],
+    );
+    let ternary_op_cost =
+        builder.add_const_extension(ternary_op_cost, F::from_canonical_u32(G_MID.unwrap()));
+
+    let nv_lv_diff = builder.sub_extension(nv.gas, lv.gas);
+    let gas_diff = builder.sub_extension(nv_lv_diff, ternary_op_cost);
+    let constr = builder.mul_extension(filter, gas_diff);
+    yield_constr.constraint_transition(builder, constr);
 }
 
 fn eval_ext_circuit_init<F: RichField + Extendable<D>, const D: usize>(
diff --git a/evm/src/cpu/halt.rs b/evm/src/cpu/halt.rs
new file mode 100644
index 0000000000..9ad34344ea
--- /dev/null
+++ b/evm/src/cpu/halt.rs
@@ -0,0 +1,98 @@
+//! Once the CPU execution is over (i.e. reached the `halt` label in the kernel),
+//! the CPU trace will be padded with special dummy rows, incurring no memory overhead.
+
+use plonky2::field::extension::Extendable;
+use plonky2::field::packed::PackedField;
+use plonky2::hash::hash_types::RichField;
+use plonky2::iop::ext_target::ExtensionTarget;
+
+use super::control_flow::get_halt_pc;
+use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
+use crate::cpu::columns::{CpuColumnsView, COL_MAP};
+use crate::cpu::membus::NUM_GP_CHANNELS;
+
+pub fn eval_packed<P: PackedField>(
+    lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
+    yield_constr: &mut ConstraintConsumer<P>,
+) {
+    let is_cpu_cycle: P = COL_MAP.op.iter().map(|&col_i| lv[col_i]).sum();
+    let is_cpu_cycle_next: P = COL_MAP.op.iter().map(|&col_i| nv[col_i]).sum();
+
+    let halt_state = P::ONES - lv.is_bootstrap_kernel - is_cpu_cycle;
+    let next_halt_state = P::ONES - nv.is_bootstrap_kernel - is_cpu_cycle_next;
+
+    // The halt flag must be boolean.
+    yield_constr.constraint(halt_state * (halt_state - P::ONES));
+    // Once we reach a padding row, there must be only padding rows.
+    yield_constr.constraint_transition(halt_state * (next_halt_state - P::ONES));
+
+    // Padding rows should have their memory channels disabled.
+    for i in 0..NUM_GP_CHANNELS {
+        let channel = lv.mem_channels[i];
+        yield_constr.constraint(halt_state * channel.used);
+    }
+
+    // The last row must be a dummy padding row.
+    yield_constr.constraint_last_row(halt_state - P::ONES);
+
+    // Also, a padding row's `program_counter` must be at the `halt` label.
+    // In particular, it ensures that the first padding row may only be added
+    // after we jumped to the `halt` function. Subsequent padding rows may set
+    // the `program_counter` to arbitrary values (there are no transition
+    // constraints) so we can place this requirement on them too.
+    let halt_pc = get_halt_pc::<P::Scalar>();
+    yield_constr.constraint(halt_state * (lv.program_counter - halt_pc));
+}
+
+pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
+    builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
+    lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
+    yield_constr: &mut RecursiveConstraintConsumer<F, D>,
+) {
+    let one = builder.one_extension();
+
+    let is_cpu_cycle = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| lv[col_i]));
+    let is_cpu_cycle_next = builder.add_many_extension(COL_MAP.op.iter().map(|&col_i| nv[col_i]));
+
+    let halt_state = builder.add_extension(lv.is_bootstrap_kernel, is_cpu_cycle);
+    let halt_state = builder.sub_extension(one, halt_state);
+    let next_halt_state = builder.add_extension(nv.is_bootstrap_kernel, is_cpu_cycle_next);
+    let next_halt_state = builder.sub_extension(one, next_halt_state);
+
+    // The halt flag must be boolean.
+    let constr = builder.mul_sub_extension(halt_state, halt_state, halt_state);
+    yield_constr.constraint(builder, constr);
+    // Once we reach a padding row, there must be only padding rows.
+    let constr = builder.mul_sub_extension(halt_state, next_halt_state, halt_state);
+    yield_constr.constraint_transition(builder, constr);
+
+    // Padding rows should have their memory channels disabled.
+    for i in 0..NUM_GP_CHANNELS {
+        let channel = lv.mem_channels[i];
+        let constr = builder.mul_extension(halt_state, channel.used);
+        yield_constr.constraint(builder, constr);
+    }
+
+    // The last row must be a dummy padding row.
+    {
+        let one = builder.one_extension();
+        let constr = builder.sub_extension(halt_state, one);
+        yield_constr.constraint_last_row(builder, constr);
+    }
+
+    // Also, a padding row's `program_counter` must be at the `halt` label.
+    // In particular, it ensures that the first padding row may only be added
+    // after we jumped to the `halt` function. Subsequent padding rows may set
+    // the `program_counter` to arbitrary values (there are no transition
+    // constraints) so we can place this requirement on them too.
+    {
+        let halt_pc = get_halt_pc();
+        let halt_pc_target = builder.constant_extension(halt_pc);
+        let constr = builder.sub_extension(lv.program_counter, halt_pc_target);
+        let constr = builder.mul_extension(halt_state, constr);
+
+        yield_constr.constraint(builder, constr);
+    }
+}
diff --git a/evm/src/cpu/jumps.rs b/evm/src/cpu/jumps.rs
index a3c38a90a6..62d9bdfd25 100644
--- a/evm/src/cpu/jumps.rs
+++ b/evm/src/cpu/jumps.rs
@@ -75,8 +75,8 @@ pub fn eval_packed_jump_jumpi<P: PackedField>(
     let is_jumpi = filter * lv.opcode_bits[0];
 
     // Stack constraints.
-    stack::eval_packed_one(lv, is_jump, stack::JUMP_OP.unwrap(), yield_constr);
-    stack::eval_packed_one(lv, is_jumpi, stack::JUMPI_OP.unwrap(), yield_constr);
+    stack::eval_packed_one(lv, nv, is_jump, stack::JUMP_OP.unwrap(), yield_constr);
+    stack::eval_packed_one(lv, nv, is_jumpi, stack::JUMPI_OP.unwrap(), yield_constr);
 
     // If `JUMP`, re-use the `JUMPI` logic, but setting the second input (the predicate) to be 1.
     // In other words, we implement `JUMP(dst)` as `JUMPI(dst, cond=1)`.
@@ -151,10 +151,18 @@ pub fn eval_ext_circuit_jump_jumpi<F: RichField + Extendable<D>, const D: usize>
     let is_jumpi = builder.mul_extension(filter, lv.opcode_bits[0]);
 
     // Stack constraints.
-    stack::eval_ext_circuit_one(builder, lv, is_jump, stack::JUMP_OP.unwrap(), yield_constr);
     stack::eval_ext_circuit_one(
         builder,
         lv,
+        nv,
+        is_jump,
+        stack::JUMP_OP.unwrap(),
+        yield_constr,
+    );
+    stack::eval_ext_circuit_one(
+        builder,
+        lv,
+        nv,
         is_jumpi,
         stack::JUMPI_OP.unwrap(),
         yield_constr,
diff --git a/evm/src/cpu/kernel/asm/halt.asm b/evm/src/cpu/kernel/asm/halt.asm
index 906ce51aaa..49561fd660 100644
--- a/evm/src/cpu/kernel/asm/halt.asm
+++ b/evm/src/cpu/kernel/asm/halt.asm
@@ -1,6 +1,2 @@
 global halt:
-    PUSH halt_pc0
-global halt_pc0:
-    DUP1
-global halt_pc1:
-    JUMP
+    PANIC
diff --git a/evm/src/cpu/kernel/asm/memory/packing.asm b/evm/src/cpu/kernel/asm/memory/packing.asm
index 81ab31236e..1dbbf39362 100644
--- a/evm/src/cpu/kernel/asm/memory/packing.asm
+++ b/evm/src/cpu/kernel/asm/memory/packing.asm
@@ -42,10 +42,10 @@ global mload_packing_u64_LE:
 // Post stack: offset'
 global mstore_unpacking:
     // stack: context, segment, offset, value, len, retdest
-    %stack(context, segment, offset, value, len, retdest) -> (context, segment, offset, value, len, len, offset, retdest)
-    // stack: context, segment, offset, value, len, len, offset, retdest
+    %stack(context, segment, offset, value, len, retdest) -> (context, segment, offset, value, len, offset, len, retdest)
+    // stack: context, segment, offset, value, len, offset, len, retdest
     MSTORE_32BYTES
-    // stack: len, offset, retdest
+    // stack: offset, len, retdest
     ADD SWAP1
     // stack: retdest, offset'
     JUMP
diff --git a/evm/src/cpu/memio.rs b/evm/src/cpu/memio.rs
index 09490e87e4..aa3749cab2 100644
--- a/evm/src/cpu/memio.rs
+++ b/evm/src/cpu/memio.rs
@@ -7,6 +7,7 @@ use plonky2::iop::ext_target::ExtensionTarget;
 use crate::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsumer};
 use crate::cpu::columns::CpuColumnsView;
 use crate::cpu::membus::NUM_GP_CHANNELS;
+use crate::cpu::stack;
 
 fn get_addr<T: Copy>(lv: &CpuColumnsView<T>) -> (T, T, T) {
     let addr_context = lv.mem_channels[0].value[0];
@@ -17,9 +18,11 @@ fn get_addr<T: Copy>(lv: &CpuColumnsView<T>) -> (T, T, T) {
 
 fn eval_packed_load<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let filter = lv.op.mload_general;
+    // MLOAD_GENERAL (0xfb) and MSTORE_GENERAL (0xfc) share the m_op_general flag; MLOAD_GENERAL is the one with lv.opcode_bits[0] = 1.
+    let filter = lv.op.m_op_general * lv.opcode_bits[0];
 
     let (addr_context, addr_segment, addr_virtual) = get_addr(lv);
 
@@ -38,14 +41,25 @@ fn eval_packed_load<P: PackedField>(
     for &channel in &lv.mem_channels[4..NUM_GP_CHANNELS - 1] {
         yield_constr.constraint(filter * channel.used);
     }
+
+    // Stack constraints
+    stack::eval_packed_one(
+        lv,
+        nv,
+        filter,
+        stack::MLOAD_GENERAL_OP.unwrap(),
+        yield_constr,
+    );
 }
 
 fn eval_ext_circuit_load<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let filter = lv.op.mload_general;
+    let mut filter = lv.op.m_op_general;
+    filter = builder.mul_extension(filter, lv.opcode_bits[0]);
 
     let (addr_context, addr_segment, addr_virtual) = get_addr(lv);
 
@@ -82,13 +96,24 @@ fn eval_ext_circuit_load<F: RichField + Extendable<D>, const D: usize>(
         let constr = builder.mul_extension(filter, channel.used);
         yield_constr.constraint(builder, constr);
     }
+
+    // Stack constraints
+    stack::eval_ext_circuit_one(
+        builder,
+        lv,
+        nv,
+        filter,
+        stack::MLOAD_GENERAL_OP.unwrap(),
+        yield_constr,
+    );
 }
 
 fn eval_packed_store<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let filter = lv.op.mstore_general;
+    let filter = lv.op.m_op_general * (P::ONES - lv.opcode_bits[0]);
 
     let (addr_context, addr_segment, addr_virtual) = get_addr(lv);
 
@@ -107,14 +132,27 @@ fn eval_packed_store<P: PackedField>(
     for &channel in &lv.mem_channels[5..] {
         yield_constr.constraint(filter * channel.used);
     }
+
+    // Stack constraints
+    stack::eval_packed_one(
+        lv,
+        nv,
+        filter,
+        stack::MSTORE_GENERAL_OP.unwrap(),
+        yield_constr,
+    );
 }
 
 fn eval_ext_circuit_store<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let filter = lv.op.mstore_general;
+    let mut filter = lv.op.m_op_general;
+    let one = builder.one_extension();
+    let minus = builder.sub_extension(one, lv.opcode_bits[0]);
+    filter = builder.mul_extension(filter, minus);
 
     let (addr_context, addr_segment, addr_virtual) = get_addr(lv);
 
@@ -151,21 +189,33 @@ fn eval_ext_circuit_store<F: RichField + Extendable<D>, const D: usize>(
         let constr = builder.mul_extension(filter, channel.used);
         yield_constr.constraint(builder, constr);
     }
+
+    // Stack constraints
+    stack::eval_ext_circuit_one(
+        builder,
+        lv,
+        nv,
+        filter,
+        stack::MSTORE_GENERAL_OP.unwrap(),
+        yield_constr,
+    );
 }
 
 pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    eval_packed_load(lv, yield_constr);
-    eval_packed_store(lv, yield_constr);
+    eval_packed_load(lv, nv, yield_constr);
+    eval_packed_store(lv, nv, yield_constr);
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    eval_ext_circuit_load(builder, lv, yield_constr);
-    eval_ext_circuit_store(builder, lv, yield_constr);
+    eval_ext_circuit_load(builder, lv, nv, yield_constr);
+    eval_ext_circuit_store(builder, lv, nv, yield_constr);
 }
diff --git a/evm/src/cpu/mod.rs b/evm/src/cpu/mod.rs
index 91b04cf487..b7312147b4 100644
--- a/evm/src/cpu/mod.rs
+++ b/evm/src/cpu/mod.rs
@@ -6,6 +6,7 @@ pub mod cpu_stark;
 pub(crate) mod decode;
 mod dup_swap;
 mod gas;
+mod halt;
 mod jumps;
 pub mod kernel;
 pub(crate) mod membus;
diff --git a/evm/src/cpu/modfp254.rs b/evm/src/cpu/modfp254.rs
index e6a2815d19..86f08052ef 100644
--- a/evm/src/cpu/modfp254.rs
+++ b/evm/src/cpu/modfp254.rs
@@ -19,7 +19,7 @@ pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let filter = lv.op.addfp254 + lv.op.mulfp254 + lv.op.subfp254;
+    let filter = lv.op.fp254_op;
 
     // We want to use all the same logic as the usual mod operations, but without needing to read
     // the modulus from the stack. We simply constrain `mem_channels[2]` to be our prime (that's
@@ -36,7 +36,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let filter = builder.add_many_extension([lv.op.addfp254, lv.op.mulfp254, lv.op.subfp254]);
+    let filter = lv.op.fp254_op;
 
     // We want to use all the same logic as the usual mod operations, but without needing to read
     // the modulus from the stack. We simply constrain `mem_channels[2]` to be our prime (that's
diff --git a/evm/src/cpu/shift.rs b/evm/src/cpu/shift.rs
index a8acf5d482..a424929798 100644
--- a/evm/src/cpu/shift.rs
+++ b/evm/src/cpu/shift.rs
@@ -13,7 +13,7 @@ pub(crate) fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
-    let is_shift = lv.op.shl + lv.op.shr;
+    let is_shift = lv.op.shift;
     let displacement = lv.mem_channels[0]; // holds the shift displacement d
     let two_exp = lv.mem_channels[2]; // holds 2^d
 
@@ -64,7 +64,7 @@ pub(crate) fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     lv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
-    let is_shift = builder.add_extension(lv.op.shl, lv.op.shr);
+    let is_shift = lv.op.shift;
     let displacement = lv.mem_channels[0];
     let two_exp = lv.mem_channels[2];
 
diff --git a/evm/src/cpu/simple_logic/eq_iszero.rs b/evm/src/cpu/simple_logic/eq_iszero.rs
index f16901f58f..7be021caa6 100644
--- a/evm/src/cpu/simple_logic/eq_iszero.rs
+++ b/evm/src/cpu/simple_logic/eq_iszero.rs
@@ -51,6 +51,7 @@ pub fn generate_pinv_diff<F: Field>(val0: U256, val1: U256, lv: &mut CpuColumnsV
 
 pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
     let logic = lv.general.logic();
@@ -94,9 +95,10 @@ pub fn eval_packed<P: PackedField>(
     yield_constr.constraint(eq_or_iszero_filter * (dot - unequal));
 
     // Stack constraints.
-    stack::eval_packed_one(lv, eq_filter, EQ_STACK_BEHAVIOR.unwrap(), yield_constr);
+    stack::eval_packed_one(lv, nv, eq_filter, EQ_STACK_BEHAVIOR.unwrap(), yield_constr);
     stack::eval_packed_one(
         lv,
+        nv,
         iszero_filter,
         IS_ZERO_STACK_BEHAVIOR.unwrap(),
         yield_constr,
@@ -106,6 +108,7 @@ pub fn eval_packed<P: PackedField>(
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
     let zero = builder.zero_extension();
@@ -173,6 +176,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     stack::eval_ext_circuit_one(
         builder,
         lv,
+        nv,
         eq_filter,
         EQ_STACK_BEHAVIOR.unwrap(),
         yield_constr,
@@ -180,6 +184,7 @@ pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     stack::eval_ext_circuit_one(
         builder,
         lv,
+        nv,
         iszero_filter,
         IS_ZERO_STACK_BEHAVIOR.unwrap(),
         yield_constr,
diff --git a/evm/src/cpu/simple_logic/mod.rs b/evm/src/cpu/simple_logic/mod.rs
index 03d2dd1584..9b4e60b016 100644
--- a/evm/src/cpu/simple_logic/mod.rs
+++ b/evm/src/cpu/simple_logic/mod.rs
@@ -11,17 +11,19 @@ use crate::cpu::columns::CpuColumnsView;
 
 pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
     not::eval_packed(lv, yield_constr);
-    eq_iszero::eval_packed(lv, yield_constr);
+    eq_iszero::eval_packed(lv, nv, yield_constr);
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
     not::eval_ext_circuit(builder, lv, yield_constr);
-    eq_iszero::eval_ext_circuit(builder, lv, yield_constr);
+    eq_iszero::eval_ext_circuit(builder, lv, nv, yield_constr);
 }
diff --git a/evm/src/cpu/stack.rs b/evm/src/cpu/stack.rs
index cfeaa1b0b5..28abf077cb 100644
--- a/evm/src/cpu/stack.rs
+++ b/evm/src/cpu/stack.rs
@@ -44,35 +44,31 @@ pub(crate) const JUMPI_OP: Option<StackBehavior> = Some(StackBehavior {
     disable_other_channels: false,
 });
 
+pub(crate) const MLOAD_GENERAL_OP: Option<StackBehavior> = Some(StackBehavior {
+    num_pops: 3,
+    pushes: true,
+    disable_other_channels: false,
+});
+
+pub(crate) const MSTORE_GENERAL_OP: Option<StackBehavior> = Some(StackBehavior {
+    num_pops: 4,
+    pushes: false,
+    disable_other_channels: false,
+});
+
 // AUDITORS: If the value below is `None`, then the operation must be manually checked to ensure
 // that every general-purpose memory channel is either disabled or has its read flag and address
 // propertly constrained. The same applies  when `disable_other_channels` is set to `false`,
 // except the first `num_pops` and the last `pushes as usize` channels have their read flag and
 // address constrained automatically in this file.
 const STACK_BEHAVIORS: OpsColumnsView<Option<StackBehavior>> = OpsColumnsView {
-    add: BASIC_BINARY_OP,
-    mul: BASIC_BINARY_OP,
-    sub: BASIC_BINARY_OP,
-    div: BASIC_BINARY_OP,
-    mod_: BASIC_BINARY_OP,
-    addmod: BASIC_TERNARY_OP,
-    mulmod: BASIC_TERNARY_OP,
-    addfp254: BASIC_BINARY_OP,
-    mulfp254: BASIC_BINARY_OP,
-    subfp254: BASIC_BINARY_OP,
-    submod: BASIC_TERNARY_OP,
-    lt: BASIC_BINARY_OP,
-    gt: BASIC_BINARY_OP,
+    binary_op: BASIC_BINARY_OP,
+    ternary_op: BASIC_TERNARY_OP,
+    fp254_op: BASIC_BINARY_OP,
     eq_iszero: None, // EQ is binary, IS_ZERO is unary.
     logic_op: BASIC_BINARY_OP,
     not: BASIC_UNARY_OP,
-    byte: BASIC_BINARY_OP,
-    shl: Some(StackBehavior {
-        num_pops: 2,
-        pushes: true,
-        disable_other_channels: false,
-    }),
-    shr: Some(StackBehavior {
+    shift: Some(StackBehavior {
         num_pops: 2,
         pushes: true,
         disable_other_channels: false,
@@ -123,16 +119,7 @@ const STACK_BEHAVIORS: OpsColumnsView<Option<StackBehavior>> = OpsColumnsView {
         pushes: false,
         disable_other_channels: true,
     }),
-    mload_general: Some(StackBehavior {
-        num_pops: 3,
-        pushes: true,
-        disable_other_channels: false,
-    }),
-    mstore_general: Some(StackBehavior {
-        num_pops: 4,
-        pushes: false,
-        disable_other_channels: false,
-    }),
+    m_op_general: None,
     syscall: Some(StackBehavior {
         num_pops: 0,
         pushes: true,
@@ -150,6 +137,7 @@ pub(crate) const IS_ZERO_STACK_BEHAVIOR: Option<StackBehavior> = BASIC_UNARY_OP;
 
 pub(crate) fn eval_packed_one<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     filter: P,
     stack_behavior: StackBehavior,
     yield_constr: &mut ConstraintConsumer<P>,
@@ -195,15 +183,21 @@ pub(crate) fn eval_packed_one<P: PackedField>(
             yield_constr.constraint(filter * channel.used);
         }
     }
+
+    // Constrain new stack length.
+    let num_pops = P::Scalar::from_canonical_usize(stack_behavior.num_pops);
+    let push = P::Scalar::from_canonical_usize(stack_behavior.pushes as usize);
+    yield_constr.constraint_transition(filter * (nv.stack_len - (lv.stack_len - num_pops + push)));
 }
 
 pub fn eval_packed<P: PackedField>(
     lv: &CpuColumnsView<P>,
+    nv: &CpuColumnsView<P>,
     yield_constr: &mut ConstraintConsumer<P>,
 ) {
     for (op, stack_behavior) in izip!(lv.op.into_iter(), STACK_BEHAVIORS.into_iter()) {
         if let Some(stack_behavior) = stack_behavior {
-            eval_packed_one(lv, op, stack_behavior, yield_constr);
+            eval_packed_one(lv, nv, op, stack_behavior, yield_constr);
         }
     }
 }
@@ -211,6 +205,7 @@ pub fn eval_packed<P: PackedField>(
 pub(crate) fn eval_ext_circuit_one<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     filter: ExtensionTarget<D>,
     stack_behavior: StackBehavior,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
@@ -308,16 +303,27 @@ pub(crate) fn eval_ext_circuit_one<F: RichField + Extendable<D>, const D: usize>
             yield_constr.constraint(builder, constr);
         }
     }
+
+    // Constrain new stack length.
+    let diff = builder.constant_extension(
+        F::Extension::from_canonical_usize(stack_behavior.num_pops)
+            - F::Extension::from_canonical_usize(stack_behavior.pushes as usize),
+    );
+    let diff = builder.sub_extension(lv.stack_len, diff);
+    let diff = builder.sub_extension(nv.stack_len, diff);
+    let constr = builder.mul_extension(filter, diff);
+    yield_constr.constraint_transition(builder, constr);
 }
 
 pub fn eval_ext_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut plonky2::plonk::circuit_builder::CircuitBuilder<F, D>,
     lv: &CpuColumnsView<ExtensionTarget<D>>,
+    nv: &CpuColumnsView<ExtensionTarget<D>>,
     yield_constr: &mut RecursiveConstraintConsumer<F, D>,
 ) {
     for (op, stack_behavior) in izip!(lv.op.into_iter(), STACK_BEHAVIORS.into_iter()) {
         if let Some(stack_behavior) = stack_behavior {
-            eval_ext_circuit_one(builder, lv, op, stack_behavior, yield_constr);
+            eval_ext_circuit_one(builder, lv, nv, op, stack_behavior, yield_constr);
         }
     }
 }
diff --git a/evm/src/cross_table_lookup.rs b/evm/src/cross_table_lookup.rs
index a9b90428ca..315bf42f7b 100644
--- a/evm/src/cross_table_lookup.rs
+++ b/evm/src/cross_table_lookup.rs
@@ -25,6 +25,7 @@ use crate::vars::{StarkEvaluationTargets, StarkEvaluationVars};
 #[derive(Clone, Debug)]
 pub struct Column<F: Field> {
     linear_combination: Vec<(usize, F)>,
+    next_row_linear_combination: Vec<(usize, F)>,
     constant: F,
 }
 
@@ -32,6 +33,7 @@ impl<F: Field> Column<F> {
     pub fn single(c: usize) -> Self {
         Self {
             linear_combination: vec![(c, F::ONE)],
+            next_row_linear_combination: vec![],
             constant: F::ZERO,
         }
     }
@@ -42,9 +44,24 @@ impl<F: Field> Column<F> {
         cs.into_iter().map(|c| Self::single(*c.borrow()))
     }
 
+    pub fn single_next_row(c: usize) -> Self {
+        Self {
+            linear_combination: vec![],
+            next_row_linear_combination: vec![(c, F::ONE)],
+            constant: F::ZERO,
+        }
+    }
+
+    pub fn singles_next_row<I: IntoIterator<Item = impl Borrow<usize>>>(
+        cs: I,
+    ) -> impl Iterator<Item = Self> {
+        cs.into_iter().map(|c| Self::single_next_row(*c.borrow()))
+    }
+
     pub fn constant(constant: F) -> Self {
         Self {
             linear_combination: vec![],
+            next_row_linear_combination: vec![],
             constant,
         }
     }
@@ -70,6 +87,34 @@ impl<F: Field> Column<F> {
         );
         Self {
             linear_combination: v,
+            next_row_linear_combination: vec![],
+            constant,
+        }
+    }
+
+    pub fn linear_combination_and_next_row_with_constant<I: IntoIterator<Item = (usize, F)>>(
+        iter: I,
+        next_row_iter: I,
+        constant: F,
+    ) -> Self {
+        let v = iter.into_iter().collect::<Vec<_>>();
+        let next_row_v = next_row_iter.into_iter().collect::<Vec<_>>();
+
+        assert!(!v.is_empty() || !next_row_v.is_empty());
+        debug_assert_eq!(
+            v.iter().map(|(c, _)| c).unique().count(),
+            v.len(),
+            "Duplicate columns."
+        );
+        debug_assert_eq!(
+            next_row_v.iter().map(|(c, _)| c).unique().count(),
+            next_row_v.len(),
+            "Duplicate columns."
+        );
+
+        Self {
+            linear_combination: v,
+            next_row_linear_combination: next_row_v,
             constant,
         }
     }
@@ -106,13 +151,43 @@ impl<F: Field> Column<F> {
             + FE::from_basefield(self.constant)
     }
 
+    pub fn eval_with_next<FE, P, const D: usize>(&self, v: &[P], next_v: &[P]) -> P
+    where
+        FE: FieldExtension<D, BaseField = F>,
+        P: PackedField<Scalar = FE>,
+    {
+        self.linear_combination
+            .iter()
+            .map(|&(c, f)| v[c] * FE::from_basefield(f))
+            .sum::<P>()
+            + self
+                .next_row_linear_combination
+                .iter()
+                .map(|&(c, f)| next_v[c] * FE::from_basefield(f))
+                .sum::<P>()
+            + FE::from_basefield(self.constant)
+    }
+
     /// Evaluate on an row of a table given in column-major form.
     pub fn eval_table(&self, table: &[PolynomialValues<F>], row: usize) -> F {
-        self.linear_combination
+        let mut res = self
+            .linear_combination
             .iter()
             .map(|&(c, f)| table[c].values[row] * f)
             .sum::<F>()
-            + self.constant
+            + self.constant;
+
+        // If we access the next row at the last row, for sanity, we consider the next row's values to be 0.
+        // If CTLs are correctly written, the filter should be 0 in that case anyway.
+        if !self.next_row_linear_combination.is_empty() && row < table[0].values.len() - 1 {
+            res += self
+                .next_row_linear_combination
+                .iter()
+                .map(|&(c, f)| table[c].values[row + 1] * f)
+                .sum::<F>();
+        }
+
+        res
     }
 
     pub fn eval_circuit<const D: usize>(
@@ -136,6 +211,36 @@ impl<F: Field> Column<F> {
         let constant = builder.constant_extension(F::Extension::from_basefield(self.constant));
         builder.inner_product_extension(F::ONE, constant, pairs)
     }
+
+    pub fn eval_with_next_circuit<const D: usize>(
+        &self,
+        builder: &mut CircuitBuilder<F, D>,
+        v: &[ExtensionTarget<D>],
+        next_v: &[ExtensionTarget<D>],
+    ) -> ExtensionTarget<D>
+    where
+        F: RichField + Extendable<D>,
+    {
+        let mut pairs = self
+            .linear_combination
+            .iter()
+            .map(|&(c, f)| {
+                (
+                    v[c],
+                    builder.constant_extension(F::Extension::from_basefield(f)),
+                )
+            })
+            .collect::<Vec<_>>();
+        let next_row_pairs = self.next_row_linear_combination.iter().map(|&(c, f)| {
+            (
+                next_v[c],
+                builder.constant_extension(F::Extension::from_basefield(f)),
+            )
+        });
+        pairs.extend(next_row_pairs);
+        let constant = builder.constant_extension(F::Extension::from_basefield(self.constant));
+        builder.inner_product_extension(F::ONE, constant, pairs)
+    }
 }
 
 #[derive(Clone, Debug)]
@@ -276,7 +381,7 @@ fn partial_products<F: Field>(
     let mut partial_prod = F::ONE;
     let degree = trace[0].len();
     let mut res = Vec::with_capacity(degree);
-    for i in 0..degree {
+    for i in (0..degree).rev() {
         let filter = if let Some(column) = filter_column {
             column.eval_table(trace, i)
         } else {
@@ -293,6 +398,7 @@ fn partial_products<F: Field>(
         };
         res.push(partial_prod);
     }
+    res.reverse();
     res.into()
 }
 
@@ -362,6 +468,10 @@ impl<'a, F: RichField + Extendable<D>, const D: usize>
     }
 }
 
+/// CTL Z partial products are upside down: the complete product is on the first row, and
+/// the first term is on the last row. This allows the transition constraint to be:
+/// Z(w) = Z(gw) * combine(w) where combine is called on the local row
+/// and not the next. This enables CTLs across two rows.
 pub(crate) fn eval_cross_table_lookup_checks<F, FE, P, S, const D: usize, const D2: usize>(
     vars: StarkEvaluationVars<FE, P, { S::COLUMNS }>,
     ctl_vars: &[CtlCheckVars<F, FE, P, D2>],
@@ -380,27 +490,23 @@ pub(crate) fn eval_cross_table_lookup_checks<F, FE, P, S, const D: usize, const
             columns,
             filter_column,
         } = lookup_vars;
-        let combine = |v: &[P]| -> P {
-            let evals = columns.iter().map(|c| c.eval(v)).collect::<Vec<_>>();
-            challenges.combine(evals.iter())
-        };
-        let filter = |v: &[P]| -> P {
-            if let Some(column) = filter_column {
-                column.eval(v)
-            } else {
-                P::ONES
-            }
+
+        let evals = columns
+            .iter()
+            .map(|c| c.eval_with_next(vars.local_values, vars.next_values))
+            .collect::<Vec<_>>();
+        let combined = challenges.combine(evals.iter());
+        let local_filter = if let Some(column) = filter_column {
+            column.eval(vars.local_values)
+        } else {
+            P::ONES
         };
-        let local_filter = filter(vars.local_values);
-        let next_filter = filter(vars.next_values);
-        let select = |filter, x| filter * x + P::ONES - filter;
-
-        // Check value of `Z(1)`
-        consumer.constraint_first_row(*local_z - select(local_filter, combine(vars.local_values)));
-        // Check `Z(gw) = combination * Z(w)`
-        consumer.constraint_transition(
-            *local_z * select(next_filter, combine(vars.next_values)) - *next_z,
-        );
+        let select = local_filter * combined + P::ONES - local_filter;
+
+        // Check value of `Z(g^(n-1))`
+        consumer.constraint_last_row(*local_z - select);
+        // Check `Z(w) = combination * Z(gw)`
+        consumer.constraint_transition(*next_z * select - *local_z);
     }
 }
 
@@ -493,11 +599,6 @@ pub(crate) fn eval_cross_table_lookup_checks_circuit<
         } else {
             one
         };
-        let next_filter = if let Some(column) = filter_column {
-            column.eval_circuit(builder, vars.next_values)
-        } else {
-            one
-        };
         fn select<F: RichField + Extendable<D>, const D: usize>(
             builder: &mut CircuitBuilder<F, D>,
             filter: ExtensionTarget<D>,
@@ -508,34 +609,30 @@ pub(crate) fn eval_cross_table_lookup_checks_circuit<
             builder.mul_add_extension(filter, x, tmp) // filter * x + 1 - filter
         }
 
-        // Check value of `Z(1)`
-        let local_columns_eval = columns
-            .iter()
-            .map(|c| c.eval_circuit(builder, vars.local_values))
-            .collect::<Vec<_>>();
-        let combined_local = challenges.combine_circuit(builder, &local_columns_eval);
-        let selected_local = select(builder, local_filter, combined_local);
-        let first_row = builder.sub_extension(*local_z, selected_local);
-        consumer.constraint_first_row(builder, first_row);
-        // Check `Z(gw) = combination * Z(w)`
-        let next_columns_eval = columns
+        let evals = columns
             .iter()
-            .map(|c| c.eval_circuit(builder, vars.next_values))
+            .map(|c| c.eval_with_next_circuit(builder, vars.local_values, vars.next_values))
             .collect::<Vec<_>>();
-        let combined_next = challenges.combine_circuit(builder, &next_columns_eval);
-        let selected_next = select(builder, next_filter, combined_next);
-        let transition = builder.mul_sub_extension(*local_z, selected_next, *next_z);
+
+        let combined = challenges.combine_circuit(builder, &evals);
+        let select = select(builder, local_filter, combined);
+
+        // Check value of `Z(g^(n-1))`
+        let last_row = builder.sub_extension(*local_z, select);
+        consumer.constraint_last_row(builder, last_row);
+        // Check `Z(w) = combination * Z(gw)`
+        let transition = builder.mul_sub_extension(*next_z, select, *local_z);
         consumer.constraint_transition(builder, transition);
     }
 }
 
 pub(crate) fn verify_cross_table_lookups<F: RichField + Extendable<D>, const D: usize>(
     cross_table_lookups: &[CrossTableLookup<F>],
-    ctl_zs_lasts: [Vec<F>; NUM_TABLES],
+    ctl_zs_first: [Vec<F>; NUM_TABLES],
     ctl_extra_looking_products: Vec<Vec<F>>,
     config: &StarkConfig,
 ) -> Result<()> {
-    let mut ctl_zs_openings = ctl_zs_lasts.iter().map(|v| v.iter()).collect::<Vec<_>>();
+    let mut ctl_zs_openings = ctl_zs_first.iter().map(|v| v.iter()).collect::<Vec<_>>();
     for (
         index,
         CrossTableLookup {
@@ -568,11 +665,11 @@ pub(crate) fn verify_cross_table_lookups<F: RichField + Extendable<D>, const D:
 pub(crate) fn verify_cross_table_lookups_circuit<F: RichField + Extendable<D>, const D: usize>(
     builder: &mut CircuitBuilder<F, D>,
     cross_table_lookups: Vec<CrossTableLookup<F>>,
-    ctl_zs_lasts: [Vec<Target>; NUM_TABLES],
+    ctl_zs_first: [Vec<Target>; NUM_TABLES],
     ctl_extra_looking_products: Vec<Vec<Target>>,
     inner_config: &StarkConfig,
 ) {
-    let mut ctl_zs_openings = ctl_zs_lasts.iter().map(|v| v.iter()).collect::<Vec<_>>();
+    let mut ctl_zs_openings = ctl_zs_first.iter().map(|v| v.iter()).collect::<Vec<_>>();
     for CrossTableLookup {
         looking_tables,
         looked_table,
diff --git a/evm/src/fixed_recursive_verifier.rs b/evm/src/fixed_recursive_verifier.rs
index 8b15cde7b1..02887dd939 100644
--- a/evm/src/fixed_recursive_verifier.rs
+++ b/evm/src/fixed_recursive_verifier.rs
@@ -525,7 +525,7 @@ where
         verify_cross_table_lookups_circuit::<F, D>(
             &mut builder,
             all_cross_table_lookups(),
-            pis.map(|p| p.ctl_zs_last),
+            pis.map(|p| p.ctl_zs_first),
             extra_looking_products,
             stark_config,
         );
diff --git a/evm/src/generation/mod.rs b/evm/src/generation/mod.rs
index 13c6670ba6..35078e0784 100644
--- a/evm/src/generation/mod.rs
+++ b/evm/src/generation/mod.rs
@@ -16,6 +16,7 @@ use GlobalMetadata::{
 use crate::all_stark::{AllStark, NUM_TABLES};
 use crate::config::StarkConfig;
 use crate::cpu::bootstrap_kernel::generate_bootstrap_kernel;
+use crate::cpu::columns::CpuColumnsView;
 use crate::cpu::kernel::aggregator::KERNEL;
 use crate::cpu::kernel::constants::global_metadata::GlobalMetadata;
 use crate::generation::outputs::{get_outputs, GenerationOutputs};
@@ -278,26 +279,36 @@ pub fn generate_traces<F: RichField + Extendable<D>, const D: usize>(
 fn simulate_cpu<F: RichField + Extendable<D>, const D: usize>(
     state: &mut GenerationState<F>,
 ) -> anyhow::Result<()> {
-    let halt_pc0 = KERNEL.global_labels["halt_pc0"];
-    let halt_pc1 = KERNEL.global_labels["halt_pc1"];
+    let halt_pc = KERNEL.global_labels["halt"];
 
-    let mut already_in_halt_loop = false;
     loop {
         // If we've reached the kernel's halt routine, and our trace length is a power of 2, stop.
         let pc = state.registers.program_counter;
-        let in_halt_loop = state.registers.is_kernel && (pc == halt_pc0 || pc == halt_pc1);
-        if in_halt_loop && !already_in_halt_loop {
+        let halt = state.registers.is_kernel && pc == halt_pc;
+        if halt {
             log::info!("CPU halted after {} cycles", state.traces.clock());
-        }
-        already_in_halt_loop |= in_halt_loop;
-
-        transition(state)?;
 
-        if already_in_halt_loop && state.traces.clock().is_power_of_two() {
+            // Padding
+            let mut row = CpuColumnsView::<F>::default();
+            row.clock = F::from_canonical_usize(state.traces.clock());
+            row.context = F::from_canonical_usize(state.registers.context);
+            row.program_counter = F::from_canonical_usize(pc);
+            row.is_kernel_mode = F::ONE;
+            row.gas = F::from_canonical_u64(state.registers.gas_used);
+            row.stack_len = F::from_canonical_usize(state.registers.stack_len);
+
+            loop {
+                state.traces.push_cpu(row);
+                row.clock += F::ONE;
+                if state.traces.clock().is_power_of_two() {
+                    break;
+                }
+            }
             log::info!("CPU trace padded to {} cycles", state.traces.clock());
-            break;
+
+            return Ok(());
         }
-    }
 
-    Ok(())
+        transition(state)?;
+    }
 }
diff --git a/evm/src/keccak_sponge/columns.rs b/evm/src/keccak_sponge/columns.rs
index 44f66a5d92..431c09e092 100644
--- a/evm/src/keccak_sponge/columns.rs
+++ b/evm/src/keccak_sponge/columns.rs
@@ -5,11 +5,14 @@ use crate::util::{indices_arr, transmute_no_compile_time_size_checks};
 
 pub(crate) const KECCAK_WIDTH_BYTES: usize = 200;
 pub(crate) const KECCAK_WIDTH_U32S: usize = KECCAK_WIDTH_BYTES / 4;
+pub(crate) const KECCAK_WIDTH_MINUS_DIGEST_U32S: usize =
+    (KECCAK_WIDTH_BYTES - KECCAK_DIGEST_BYTES) / 4;
 pub(crate) const KECCAK_RATE_BYTES: usize = 136;
 pub(crate) const KECCAK_RATE_U32S: usize = KECCAK_RATE_BYTES / 4;
 pub(crate) const KECCAK_CAPACITY_BYTES: usize = 64;
 pub(crate) const KECCAK_CAPACITY_U32S: usize = KECCAK_CAPACITY_BYTES / 4;
 pub(crate) const KECCAK_DIGEST_BYTES: usize = 32;
+pub(crate) const KECCAK_DIGEST_U32S: usize = KECCAK_DIGEST_BYTES / 4;
 
 #[repr(C)]
 #[derive(Eq, PartialEq, Debug)]
@@ -52,10 +55,14 @@ pub(crate) struct KeccakSpongeColumnsView<T: Copy> {
     pub xored_rate_u32s: [T; KECCAK_RATE_U32S],
 
     /// The entire state (rate + capacity) of the sponge, encoded as 32-bit chunks, after the
-    /// permutation is applied.
-    pub updated_state_u32s: [T; KECCAK_WIDTH_U32S],
-
-    pub updated_state_bytes: [T; KECCAK_DIGEST_BYTES],
+    /// permutation is applied, minus the first limbs where the digest is extracted from.
+    /// Those missing limbs can be recomputed from their corresponding bytes stored in
+    /// `updated_digest_state_bytes`.
+    pub partial_updated_state_u32s: [T; KECCAK_WIDTH_MINUS_DIGEST_U32S],
+
+    /// The first part of the state of the sponge, seen as bytes, after the permutation is applied.
+    /// This also represents the output digest of the Keccak sponge during the squeezing phase.
+    pub updated_digest_state_bytes: [T; KECCAK_DIGEST_BYTES],
 }
 
 // `u8` is guaranteed to have a `size_of` of 1.
diff --git a/evm/src/keccak_sponge/keccak_sponge_stark.rs b/evm/src/keccak_sponge/keccak_sponge_stark.rs
index 5f1a49ccc2..d78e965141 100644
--- a/evm/src/keccak_sponge/keccak_sponge_stark.rs
+++ b/evm/src/keccak_sponge/keccak_sponge_stark.rs
@@ -28,7 +28,7 @@ pub(crate) fn ctl_looked_data<F: Field>() -> Vec<Column<F>> {
     let mut outputs = Vec::with_capacity(8);
     for i in (0..8).rev() {
         let cur_col = Column::linear_combination(
-            cols.updated_state_bytes[i * 4..(i + 1) * 4]
+            cols.updated_digest_state_bytes[i * 4..(i + 1) * 4]
                 .iter()
                 .enumerate()
                 .map(|(j, &c)| (c, F::from_canonical_u64(1 << (24 - 8 * j)))),
@@ -49,15 +49,30 @@ pub(crate) fn ctl_looked_data<F: Field>() -> Vec<Column<F>> {
 
 pub(crate) fn ctl_looking_keccak<F: Field>() -> Vec<Column<F>> {
     let cols = KECCAK_SPONGE_COL_MAP;
-    Column::singles(
+    let mut res: Vec<_> = Column::singles(
         [
             cols.xored_rate_u32s.as_slice(),
             &cols.original_capacity_u32s,
-            &cols.updated_state_u32s,
         ]
         .concat(),
     )
-    .collect()
+    .collect();
+
+    // We recover the 32-bit digest limbs from their corresponding bytes,
+    // and then append them to the rest of the updated state limbs.
+    let digest_u32s = cols.updated_digest_state_bytes.chunks_exact(4).map(|c| {
+        Column::linear_combination(
+            c.iter()
+                .enumerate()
+                .map(|(i, &b)| (b, F::from_canonical_usize(1 << (8 * i)))),
+        )
+    });
+
+    res.extend(digest_u32s);
+
+    res.extend(Column::singles(&cols.partial_updated_state_u32s));
+
+    res
 }
 
 pub(crate) fn ctl_looking_memory<F: Field>(i: usize) -> Vec<Column<F>> {
@@ -239,7 +254,21 @@ impl<F: RichField + Extendable<D>, const D: usize> KeccakSpongeStark<F, D> {
                 block.try_into().unwrap(),
             );
 
-            sponge_state = row.updated_state_u32s.map(|f| f.to_canonical_u64() as u32);
+            sponge_state[..KECCAK_DIGEST_U32S]
+                .iter_mut()
+                .zip(row.updated_digest_state_bytes.chunks_exact(4))
+                .for_each(|(s, bs)| {
+                    *s = bs
+                        .iter()
+                        .enumerate()
+                        .map(|(i, b)| (b.to_canonical_u64() as u32) << (8 * i))
+                        .sum();
+                });
+
+            sponge_state[KECCAK_DIGEST_U32S..]
+                .iter_mut()
+                .zip(row.partial_updated_state_u32s)
+                .for_each(|(s, x)| *s = x.to_canonical_u64() as u32);
 
             rows.push(row.into());
             already_absorbed_bytes += KECCAK_RATE_BYTES;
@@ -357,24 +386,33 @@ impl<F: RichField + Extendable<D>, const D: usize> KeccakSpongeStark<F, D> {
         row.xored_rate_u32s = xored_rate_u32s.map(F::from_canonical_u32);
 
         keccakf_u32s(&mut sponge_state);
-        row.updated_state_u32s = sponge_state.map(F::from_canonical_u32);
-        let is_final_block = row.is_final_input_len.iter().copied().sum::<F>() == F::ONE;
-        if is_final_block {
-            for (l, &elt) in row.updated_state_u32s[..8].iter().enumerate() {
+        // Store all but the first `KECCAK_DIGEST_U32S` limbs in the updated state.
+        // Those missing limbs will be broken down into bytes and stored separately.
+        row.partial_updated_state_u32s.copy_from_slice(
+            &sponge_state[KECCAK_DIGEST_U32S..]
+                .iter()
+                .copied()
+                .map(|i| F::from_canonical_u32(i))
+                .collect::<Vec<_>>(),
+        );
+        sponge_state[..KECCAK_DIGEST_U32S]
+            .iter()
+            .enumerate()
+            .for_each(|(l, &elt)| {
                 let mut cur_elt = elt;
                 (0..4).for_each(|i| {
-                    row.updated_state_bytes[l * 4 + i] =
-                        F::from_canonical_u32((cur_elt.to_canonical_u64() & 0xFF) as u32);
-                    cur_elt = F::from_canonical_u64(cur_elt.to_canonical_u64() >> 8);
+                    row.updated_digest_state_bytes[l * 4 + i] =
+                        F::from_canonical_u32(cur_elt & 0xFF);
+                    cur_elt >>= 8;
                 });
 
-                let mut s = row.updated_state_bytes[l * 4].to_canonical_u64();
+                // 32-bit limb reconstruction consistency check.
+                let mut s = row.updated_digest_state_bytes[l * 4].to_canonical_u64();
                 for i in 1..4 {
-                    s += row.updated_state_bytes[l * 4 + i].to_canonical_u64() << (8 * i);
+                    s += row.updated_digest_state_bytes[l * 4 + i].to_canonical_u64() << (8 * i);
                 }
-                assert_eq!(elt, F::from_canonical_u64(s), "not equal");
-            }
-        }
+                assert_eq!(elt as u64, s, "not equal");
+            })
     }
 
     fn generate_padding_row(&self) -> [F; NUM_KECCAK_SPONGE_COLUMNS] {
@@ -445,26 +483,39 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for KeccakSpongeS
         );
 
         // If this is a full-input block, the next row's "before" should match our "after" state.
+        for (current_bytes_after, next_before) in local_values
+            .updated_digest_state_bytes
+            .chunks_exact(4)
+            .zip(&next_values.original_rate_u32s[..KECCAK_DIGEST_U32S])
+        {
+            let mut current_after = current_bytes_after[0];
+            for i in 1..4 {
+                current_after +=
+                    current_bytes_after[i] * P::from(FE::from_canonical_usize(1 << (8 * i)));
+            }
+            yield_constr
+                .constraint_transition(is_full_input_block * (*next_before - current_after));
+        }
         for (&current_after, &next_before) in local_values
-            .updated_state_u32s
+            .partial_updated_state_u32s
             .iter()
-            .zip(next_values.original_rate_u32s.iter())
+            .zip(next_values.original_rate_u32s[KECCAK_DIGEST_U32S..].iter())
         {
             yield_constr.constraint_transition(is_full_input_block * (next_before - current_after));
         }
         for (&current_after, &next_before) in local_values
-            .updated_state_u32s
+            .partial_updated_state_u32s
             .iter()
-            .skip(KECCAK_RATE_U32S)
+            .skip(KECCAK_RATE_U32S - KECCAK_DIGEST_U32S)
             .zip(next_values.original_capacity_u32s.iter())
         {
             yield_constr.constraint_transition(is_full_input_block * (next_before - current_after));
         }
 
-        // If this is a full-input block, the next row's already_absorbed_bytes should be ours plus 136.
+        // If this is a full-input block, the next row's already_absorbed_bytes should be ours plus `KECCAK_RATE_BYTES`.
         yield_constr.constraint_transition(
             is_full_input_block
-                * (already_absorbed_bytes + P::from(FE::from_canonical_u64(136))
+                * (already_absorbed_bytes + P::from(FE::from_canonical_usize(KECCAK_RATE_BYTES))
                     - next_values.already_absorbed_bytes),
         );
 
@@ -481,16 +532,6 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for KeccakSpongeS
             let entry_match = offset - P::from(FE::from_canonical_usize(i));
             yield_constr.constraint(is_final_len * entry_match);
         }
-
-        // Adding constraints for byte columns.
-        for (l, &elt) in local_values.updated_state_u32s[..8].iter().enumerate() {
-            let mut s = local_values.updated_state_bytes[l * 4];
-            for i in 1..4 {
-                s += local_values.updated_state_bytes[l * 4 + i]
-                    * P::from(FE::from_canonical_usize(1 << (8 * i)));
-            }
-            yield_constr.constraint(is_final_block * (s - elt));
-        }
     }
 
     fn eval_ext_circuit(
@@ -566,19 +607,36 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for KeccakSpongeS
         yield_constr.constraint_transition(builder, constraint);
 
         // If this is a full-input block, the next row's "before" should match our "after" state.
+        for (current_bytes_after, next_before) in local_values
+            .updated_digest_state_bytes
+            .chunks_exact(4)
+            .zip(&next_values.original_rate_u32s[..KECCAK_DIGEST_U32S])
+        {
+            let mut current_after = current_bytes_after[0];
+            for i in 1..4 {
+                current_after = builder.mul_const_add_extension(
+                    F::from_canonical_usize(1 << (8 * i)),
+                    current_bytes_after[i],
+                    current_after,
+                );
+            }
+            let diff = builder.sub_extension(*next_before, current_after);
+            let constraint = builder.mul_extension(is_full_input_block, diff);
+            yield_constr.constraint_transition(builder, constraint);
+        }
         for (&current_after, &next_before) in local_values
-            .updated_state_u32s
+            .partial_updated_state_u32s
             .iter()
-            .zip(next_values.original_rate_u32s.iter())
+            .zip(next_values.original_rate_u32s[KECCAK_DIGEST_U32S..].iter())
         {
             let diff = builder.sub_extension(next_before, current_after);
             let constraint = builder.mul_extension(is_full_input_block, diff);
             yield_constr.constraint_transition(builder, constraint);
         }
         for (&current_after, &next_before) in local_values
-            .updated_state_u32s
+            .partial_updated_state_u32s
             .iter()
-            .skip(KECCAK_RATE_U32S)
+            .skip(KECCAK_RATE_U32S - KECCAK_DIGEST_U32S)
             .zip(next_values.original_capacity_u32s.iter())
         {
             let diff = builder.sub_extension(next_before, current_after);
@@ -586,9 +644,11 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for KeccakSpongeS
             yield_constr.constraint_transition(builder, constraint);
         }
 
-        // If this is a full-input block, the next row's already_absorbed_bytes should be ours plus 136.
-        let absorbed_bytes =
-            builder.add_const_extension(already_absorbed_bytes, F::from_canonical_u64(136));
+        // If this is a full-input block, the next row's already_absorbed_bytes should be ours plus `KECCAK_RATE_BYTES`.
+        let absorbed_bytes = builder.add_const_extension(
+            already_absorbed_bytes,
+            F::from_canonical_usize(KECCAK_RATE_BYTES),
+        );
         let absorbed_diff =
             builder.sub_extension(absorbed_bytes, next_values.already_absorbed_bytes);
         let constraint = builder.mul_extension(is_full_input_block, absorbed_diff);
@@ -615,21 +675,6 @@ impl<F: RichField + Extendable<D>, const D: usize> Stark<F, D> for KeccakSpongeS
             let constraint = builder.mul_extension(is_final_len, entry_match);
             yield_constr.constraint(builder, constraint);
         }
-
-        // Adding constraints for byte columns.
-        for (l, &elt) in local_values.updated_state_u32s[..8].iter().enumerate() {
-            let mut s = local_values.updated_state_bytes[l * 4];
-            for i in 1..4 {
-                s = builder.mul_const_add_extension(
-                    F::from_canonical_usize(1 << (8 * i)),
-                    local_values.updated_state_bytes[l * 4 + i],
-                    s,
-                );
-            }
-            let constraint = builder.sub_extension(s, elt);
-            let constraint = builder.mul_extension(is_final_block, constraint);
-            yield_constr.constraint(builder, constraint);
-        }
     }
 
     fn constraint_degree(&self) -> usize {
@@ -698,9 +743,10 @@ mod tests {
         let rows = stark.generate_rows_for_op(op);
         assert_eq!(rows.len(), 1);
         let last_row: &KeccakSpongeColumnsView<F> = rows.last().unwrap().borrow();
-        let output = last_row.updated_state_u32s[..8]
+        let output = last_row
+            .updated_digest_state_bytes
             .iter()
-            .flat_map(|x| (x.to_canonical_u64() as u32).to_le_bytes())
+            .map(|x| x.to_canonical_u64() as u8)
             .collect_vec();
 
         assert_eq!(output, expected_output.0);
diff --git a/evm/src/proof.rs b/evm/src/proof.rs
index 14f22b6791..76f3af32f0 100644
--- a/evm/src/proof.rs
+++ b/evm/src/proof.rs
@@ -623,7 +623,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> S
     }
 
     pub fn num_ctl_zs(&self) -> usize {
-        self.openings.ctl_zs_last.len()
+        self.openings.ctl_zs_first.len()
     }
 }
 
@@ -704,8 +704,8 @@ pub struct StarkOpeningSet<F: RichField + Extendable<D>, const D: usize> {
     pub permutation_ctl_zs: Vec<F::Extension>,
     /// Openings of permutations and cross-table lookups `Z` polynomials at `g * zeta`.
     pub permutation_ctl_zs_next: Vec<F::Extension>,
-    /// Openings of cross-table lookups `Z` polynomials at `g^-1`.
-    pub ctl_zs_last: Vec<F>,
+    /// Openings of cross-table lookups `Z` polynomials at `1`.
+    pub ctl_zs_first: Vec<F>,
     /// Openings of quotient polynomials at `zeta`.
     pub quotient_polys: Vec<F::Extension>,
 }
@@ -717,7 +717,6 @@ impl<F: RichField + Extendable<D>, const D: usize> StarkOpeningSet<F, D> {
         trace_commitment: &PolynomialBatch<F, C, D>,
         permutation_ctl_zs_commitment: &PolynomialBatch<F, C, D>,
         quotient_commitment: &PolynomialBatch<F, C, D>,
-        degree_bits: usize,
         num_permutation_zs: usize,
     ) -> Self {
         let eval_commitment = |z: F::Extension, c: &PolynomialBatch<F, C, D>| {
@@ -738,10 +737,8 @@ impl<F: RichField + Extendable<D>, const D: usize> StarkOpeningSet<F, D> {
             next_values: eval_commitment(zeta_next, trace_commitment),
             permutation_ctl_zs: eval_commitment(zeta, permutation_ctl_zs_commitment),
             permutation_ctl_zs_next: eval_commitment(zeta_next, permutation_ctl_zs_commitment),
-            ctl_zs_last: eval_commitment_base(
-                F::primitive_root_of_unity(degree_bits).inverse(),
-                permutation_ctl_zs_commitment,
-            )[num_permutation_zs..]
+            ctl_zs_first: eval_commitment_base(F::ONE, permutation_ctl_zs_commitment)
+                [num_permutation_zs..]
                 .to_vec(),
             quotient_polys: eval_commitment(zeta, quotient_commitment),
         }
@@ -765,10 +762,10 @@ impl<F: RichField + Extendable<D>, const D: usize> StarkOpeningSet<F, D> {
                 .copied()
                 .collect_vec(),
         };
-        debug_assert!(!self.ctl_zs_last.is_empty());
-        let ctl_last_batch = FriOpeningBatch {
+        debug_assert!(!self.ctl_zs_first.is_empty());
+        let ctl_first_batch = FriOpeningBatch {
             values: self
-                .ctl_zs_last
+                .ctl_zs_first
                 .iter()
                 .copied()
                 .map(F::Extension::from_basefield)
@@ -776,7 +773,7 @@ impl<F: RichField + Extendable<D>, const D: usize> StarkOpeningSet<F, D> {
         };
 
         FriOpenings {
-            batches: vec![zeta_batch, zeta_next_batch, ctl_last_batch],
+            batches: vec![zeta_batch, zeta_next_batch, ctl_first_batch],
         }
     }
 }
@@ -787,7 +784,7 @@ pub struct StarkOpeningSetTarget<const D: usize> {
     pub next_values: Vec<ExtensionTarget<D>>,
     pub permutation_ctl_zs: Vec<ExtensionTarget<D>>,
     pub permutation_ctl_zs_next: Vec<ExtensionTarget<D>>,
-    pub ctl_zs_last: Vec<Target>,
+    pub ctl_zs_first: Vec<Target>,
     pub quotient_polys: Vec<ExtensionTarget<D>>,
 }
 
@@ -797,7 +794,7 @@ impl<const D: usize> StarkOpeningSetTarget<D> {
         buffer.write_target_ext_vec(&self.next_values)?;
         buffer.write_target_ext_vec(&self.permutation_ctl_zs)?;
         buffer.write_target_ext_vec(&self.permutation_ctl_zs_next)?;
-        buffer.write_target_vec(&self.ctl_zs_last)?;
+        buffer.write_target_vec(&self.ctl_zs_first)?;
         buffer.write_target_ext_vec(&self.quotient_polys)?;
         Ok(())
     }
@@ -807,7 +804,7 @@ impl<const D: usize> StarkOpeningSetTarget<D> {
         let next_values = buffer.read_target_ext_vec::<D>()?;
         let permutation_ctl_zs = buffer.read_target_ext_vec::<D>()?;
         let permutation_ctl_zs_next = buffer.read_target_ext_vec::<D>()?;
-        let ctl_zs_last = buffer.read_target_vec()?;
+        let ctl_zs_first = buffer.read_target_vec()?;
         let quotient_polys = buffer.read_target_ext_vec::<D>()?;
 
         Ok(Self {
@@ -815,7 +812,7 @@ impl<const D: usize> StarkOpeningSetTarget<D> {
             next_values,
             permutation_ctl_zs,
             permutation_ctl_zs_next,
-            ctl_zs_last,
+            ctl_zs_first,
             quotient_polys,
         })
     }
@@ -838,10 +835,10 @@ impl<const D: usize> StarkOpeningSetTarget<D> {
                 .copied()
                 .collect_vec(),
         };
-        debug_assert!(!self.ctl_zs_last.is_empty());
-        let ctl_last_batch = FriOpeningBatchTarget {
+        debug_assert!(!self.ctl_zs_first.is_empty());
+        let ctl_first_batch = FriOpeningBatchTarget {
             values: self
-                .ctl_zs_last
+                .ctl_zs_first
                 .iter()
                 .copied()
                 .map(|t| t.to_ext_target(zero))
@@ -849,7 +846,7 @@ impl<const D: usize> StarkOpeningSetTarget<D> {
         };
 
         FriOpeningsTarget {
-            batches: vec![zeta_batch, zeta_next_batch, ctl_last_batch],
+            batches: vec![zeta_batch, zeta_next_batch, ctl_first_batch],
         }
     }
 }
diff --git a/evm/src/prover.rs b/evm/src/prover.rs
index 425634943e..7b960c95a2 100644
--- a/evm/src/prover.rs
+++ b/evm/src/prover.rs
@@ -454,7 +454,6 @@ where
         trace_commitment,
         &permutation_ctl_zs_commitment,
         &quotient_commitment,
-        degree_bits,
         stark.num_permutation_batches(config),
     );
     challenger.observe_openings(&openings.to_fri_openings());
@@ -469,7 +468,7 @@ where
         timing,
         "compute openings proof",
         PolynomialBatch::prove_openings(
-            &stark.fri_instance(zeta, g, degree_bits, ctl_data.len(), config),
+            &stark.fri_instance(zeta, g, ctl_data.len(), config),
             &initial_merkle_trees,
             challenger,
             &fri_params,
diff --git a/evm/src/recursive_verifier.rs b/evm/src/recursive_verifier.rs
index 531669c03e..d58344bbf7 100644
--- a/evm/src/recursive_verifier.rs
+++ b/evm/src/recursive_verifier.rs
@@ -60,7 +60,7 @@ pub struct RecursiveAllProof<
 pub(crate) struct PublicInputs<T: Copy + Default + Eq + PartialEq + Debug, P: PlonkyPermutation<T>>
 {
     pub(crate) trace_cap: Vec<Vec<T>>,
-    pub(crate) ctl_zs_last: Vec<T>,
+    pub(crate) ctl_zs_first: Vec<T>,
     pub(crate) ctl_challenges: GrandProductChallengeSet<T>,
     pub(crate) challenger_state_before: P,
     pub(crate) challenger_state_after: P,
@@ -86,11 +86,11 @@ impl<T: Copy + Debug + Default + Eq + PartialEq, P: PlonkyPermutation<T>> Public
         };
         let challenger_state_before = P::new(&mut iter);
         let challenger_state_after = P::new(&mut iter);
-        let ctl_zs_last: Vec<_> = iter.collect();
+        let ctl_zs_first: Vec<_> = iter.collect();
 
         Self {
             trace_cap,
-            ctl_zs_last,
+            ctl_zs_first,
             ctl_challenges,
             challenger_state_before,
             challenger_state_after,
@@ -151,7 +151,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
         // Verify the CTL checks.
         verify_cross_table_lookups::<F, D>(
             &cross_table_lookups,
-            pis.map(|p| p.ctl_zs_last),
+            pis.map(|p| p.ctl_zs_first),
             extra_looking_products,
             inner_config,
         )?;
@@ -351,7 +351,7 @@ where
     let challenger_state = challenger.compact(&mut builder);
     builder.register_public_inputs(challenger_state.as_ref());
 
-    builder.register_public_inputs(&proof_target.openings.ctl_zs_last);
+    builder.register_public_inputs(&proof_target.openings.ctl_zs_first);
 
     verify_stark_proof_with_challenges_circuit::<F, C, _, D>(
         &mut builder,
@@ -415,7 +415,7 @@ fn verify_stark_proof_with_challenges_circuit<
         next_values,
         permutation_ctl_zs,
         permutation_ctl_zs_next,
-        ctl_zs_last,
+        ctl_zs_first,
         quotient_polys,
     } = &proof.openings;
     let vars = StarkEvaluationTargets {
@@ -485,8 +485,7 @@ fn verify_stark_proof_with_challenges_circuit<
         builder,
         challenges.stark_zeta,
         F::primitive_root_of_unity(degree_bits),
-        degree_bits,
-        ctl_zs_last.len(),
+        ctl_zs_first.len(),
         inner_config,
     );
     builder.verify_fri_proof::<C>(
@@ -870,7 +869,7 @@ fn add_virtual_stark_opening_set<F: RichField + Extendable<D>, S: Stark<F, D>, c
             .add_virtual_extension_targets(stark.num_permutation_batches(config) + num_ctl_zs),
         permutation_ctl_zs_next: builder
             .add_virtual_extension_targets(stark.num_permutation_batches(config) + num_ctl_zs),
-        ctl_zs_last: builder.add_virtual_targets(num_ctl_zs),
+        ctl_zs_first: builder.add_virtual_targets(num_ctl_zs),
         quotient_polys: builder
             .add_virtual_extension_targets(stark.quotient_degree_factor() * num_challenges),
     }
diff --git a/evm/src/stark.rs b/evm/src/stark.rs
index 72cee0ad60..73b51ada41 100644
--- a/evm/src/stark.rs
+++ b/evm/src/stark.rs
@@ -84,7 +84,6 @@ pub trait Stark<F: RichField + Extendable<D>, const D: usize>: Sync {
         &self,
         zeta: F::Extension,
         g: F,
-        degree_bits: usize,
         num_ctl_zs: usize,
         config: &StarkConfig,
     ) -> FriInstanceInfo<F, D> {
@@ -131,13 +130,13 @@ pub trait Stark<F: RichField + Extendable<D>, const D: usize>: Sync {
             point: zeta.scalar_mul(g),
             polynomials: [trace_info, permutation_ctl_zs_info].concat(),
         };
-        let ctl_last_batch = FriBatchInfo {
-            point: F::Extension::primitive_root_of_unity(degree_bits).inverse(),
+        let ctl_first_batch = FriBatchInfo {
+            point: F::Extension::ONE,
             polynomials: ctl_zs_info,
         };
         FriInstanceInfo {
             oracles: vec![trace_oracle, permutation_ctl_oracle, quotient_oracle],
-            batches: vec![zeta_batch, zeta_next_batch, ctl_last_batch],
+            batches: vec![zeta_batch, zeta_next_batch, ctl_first_batch],
         }
     }
 
@@ -147,7 +146,6 @@ pub trait Stark<F: RichField + Extendable<D>, const D: usize>: Sync {
         builder: &mut CircuitBuilder<F, D>,
         zeta: ExtensionTarget<D>,
         g: F,
-        degree_bits: usize,
         num_ctl_zs: usize,
         inner_config: &StarkConfig,
     ) -> FriInstanceInfoTarget<D> {
@@ -195,14 +193,13 @@ pub trait Stark<F: RichField + Extendable<D>, const D: usize>: Sync {
             point: zeta_next,
             polynomials: [trace_info, permutation_ctl_zs_info].concat(),
         };
-        let ctl_last_batch = FriBatchInfoTarget {
-            point: builder
-                .constant_extension(F::Extension::primitive_root_of_unity(degree_bits).inverse()),
+        let ctl_first_batch = FriBatchInfoTarget {
+            point: builder.one_extension(),
             polynomials: ctl_zs_info,
         };
         FriInstanceInfoTarget {
             oracles: vec![trace_oracle, permutation_ctl_oracle, quotient_oracle],
-            batches: vec![zeta_batch, zeta_next_batch, ctl_last_batch],
+            batches: vec![zeta_batch, zeta_next_batch, ctl_first_batch],
         }
     }
 
diff --git a/evm/src/verifier.rs b/evm/src/verifier.rs
index 297d9276a6..11f8155d29 100644
--- a/evm/src/verifier.rs
+++ b/evm/src/verifier.rs
@@ -137,7 +137,9 @@ where
 
     verify_cross_table_lookups::<F, D>(
         cross_table_lookups,
-        all_proof.stark_proofs.map(|p| p.proof.openings.ctl_zs_last),
+        all_proof
+            .stark_proofs
+            .map(|p| p.proof.openings.ctl_zs_first),
         extra_looking_products,
         config,
     )
@@ -310,7 +312,7 @@ where
         next_values,
         permutation_ctl_zs,
         permutation_ctl_zs_next,
-        ctl_zs_last,
+        ctl_zs_first,
         quotient_polys,
     } = &proof.openings;
     let vars = StarkEvaluationVars {
@@ -376,8 +378,7 @@ where
         &stark.fri_instance(
             challenges.stark_zeta,
             F::primitive_root_of_unity(degree_bits),
-            degree_bits,
-            ctl_zs_last.len(),
+            ctl_zs_first.len(),
             config,
         ),
         &proof.openings.to_fri_openings(),
@@ -417,7 +418,7 @@ where
         next_values,
         permutation_ctl_zs,
         permutation_ctl_zs_next,
-        ctl_zs_last,
+        ctl_zs_first,
         quotient_polys,
     } = openings;
 
@@ -434,7 +435,7 @@ where
     ensure!(next_values.len() == S::COLUMNS);
     ensure!(permutation_ctl_zs.len() == num_zs);
     ensure!(permutation_ctl_zs_next.len() == num_zs);
-    ensure!(ctl_zs_last.len() == num_ctl_zs);
+    ensure!(ctl_zs_first.len() == num_ctl_zs);
     ensure!(quotient_polys.len() == stark.num_quotient_polys(config));
 
     Ok(())
diff --git a/evm/src/witness/gas.rs b/evm/src/witness/gas.rs
index 3a46c04439..aa312078a5 100644
--- a/evm/src/witness/gas.rs
+++ b/evm/src/witness/gas.rs
@@ -25,8 +25,8 @@ pub(crate) fn gas_to_charge(op: Operation) -> u64 {
         BinaryArithmetic(Lt) => G_VERYLOW,
         BinaryArithmetic(Gt) => G_VERYLOW,
         BinaryArithmetic(Byte) => G_VERYLOW,
-        Shl => G_VERYLOW,
-        Shr => G_VERYLOW,
+        BinaryArithmetic(Shl) => G_VERYLOW,
+        BinaryArithmetic(Shr) => G_VERYLOW,
         BinaryArithmetic(AddFp254) => KERNEL_ONLY_INSTR,
         BinaryArithmetic(MulFp254) => KERNEL_ONLY_INSTR,
         BinaryArithmetic(SubFp254) => KERNEL_ONLY_INSTR,
diff --git a/evm/src/witness/operation.rs b/evm/src/witness/operation.rs
index b1339d0cee..8349d56dfd 100644
--- a/evm/src/witness/operation.rs
+++ b/evm/src/witness/operation.rs
@@ -29,8 +29,6 @@ use crate::{arithmetic, logic};
 pub(crate) enum Operation {
     Iszero,
     Not,
-    Shl,
-    Shr,
     Syscall(u8, usize, bool), // (syscall number, minimum stack length, increases stack length)
     Eq,
     BinaryLogic(logic::Op),
@@ -473,6 +471,7 @@ pub(crate) fn generate_iszero<F: Field>(
 fn append_shift<F: Field>(
     state: &mut GenerationState<F>,
     mut row: CpuColumnsView<F>,
+    is_shl: bool,
     input0: U256,
     input1: U256,
     log_in0: MemoryOp,
@@ -500,10 +499,10 @@ fn append_shift<F: Field>(
     } else {
         U256::one() << input0
     };
-    let operator = if row.op.shl.is_one() {
-        BinaryOperator::Mul
+    let operator = if is_shl {
+        BinaryOperator::Shl
     } else {
-        BinaryOperator::Div
+        BinaryOperator::Shr
     };
     let operation = arithmetic::Operation::binary(operator, input1, input0);
 
@@ -527,7 +526,7 @@ pub(crate) fn generate_shl<F: Field>(
     } else {
         input1 << input0
     };
-    append_shift(state, row, input0, input1, log_in0, log_in1, result)
+    append_shift(state, row, true, input0, input1, log_in0, log_in1, result)
 }
 
 pub(crate) fn generate_shr<F: Field>(
@@ -542,7 +541,7 @@ pub(crate) fn generate_shr<F: Field>(
     } else {
         input1 >> input0
     };
-    append_shift(state, row, input0, input1, log_in0, log_in1, result)
+    append_shift(state, row, false, input0, input1, log_in0, log_in1, result)
 }
 
 pub(crate) fn generate_syscall<F: Field>(
diff --git a/evm/src/witness/transition.rs b/evm/src/witness/transition.rs
index 6e279cdf7a..1418beba8d 100644
--- a/evm/src/witness/transition.rs
+++ b/evm/src/witness/transition.rs
@@ -70,8 +70,8 @@ fn decode(registers: RegistersState, opcode: u8) -> Result<Operation, ProgramErr
         (0x1a, _) => Ok(Operation::BinaryArithmetic(
             arithmetic::BinaryOperator::Byte,
         )),
-        (0x1b, _) => Ok(Operation::Shl),
-        (0x1c, _) => Ok(Operation::Shr),
+        (0x1b, _) => Ok(Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl)),
+        (0x1c, _) => Ok(Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr)),
         (0x1d, _) => Ok(Operation::Syscall(opcode, 2, false)), // SAR
         (0x20, _) => Ok(Operation::Syscall(opcode, 2, false)), // KECCAK256
         (0x21, true) => Ok(Operation::KeccakGeneral),
@@ -162,22 +162,13 @@ fn fill_op_flag<F: Field>(op: Operation, row: &mut CpuColumnsView<F>) {
         Operation::Not => &mut flags.not,
         Operation::Syscall(_, _, _) => &mut flags.syscall,
         Operation::BinaryLogic(_) => &mut flags.logic_op,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Add) => &mut flags.add,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Mul) => &mut flags.mul,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Sub) => &mut flags.sub,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Div) => &mut flags.div,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Mod) => &mut flags.mod_,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Lt) => &mut flags.lt,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Gt) => &mut flags.gt,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Byte) => &mut flags.byte,
-        Operation::Shl => &mut flags.shl,
-        Operation::Shr => &mut flags.shr,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254) => &mut flags.addfp254,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::MulFp254) => &mut flags.mulfp254,
-        Operation::BinaryArithmetic(arithmetic::BinaryOperator::SubFp254) => &mut flags.subfp254,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::AddMod) => &mut flags.addmod,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::MulMod) => &mut flags.mulmod,
-        Operation::TernaryArithmetic(arithmetic::TernaryOperator::SubMod) => &mut flags.submod,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::AddFp254)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::MulFp254)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::SubFp254) => &mut flags.fp254_op,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl)
+        | Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => &mut flags.shift,
+        Operation::BinaryArithmetic(_) => &mut flags.binary_op,
+        Operation::TernaryArithmetic(_) => &mut flags.ternary_op,
         Operation::KeccakGeneral => &mut flags.keccak_general,
         Operation::ProverInput => &mut flags.prover_input,
         Operation::Pop => &mut flags.pop,
@@ -188,8 +179,7 @@ fn fill_op_flag<F: Field>(op: Operation, row: &mut CpuColumnsView<F>) {
         Operation::Mload32Bytes => &mut flags.mload_32bytes,
         Operation::Mstore32Bytes => &mut flags.mstore_32bytes,
         Operation::ExitKernel => &mut flags.exit_kernel,
-        Operation::MloadGeneral => &mut flags.mload_general,
-        Operation::MstoreGeneral => &mut flags.mstore_general,
+        Operation::MloadGeneral | Operation::MstoreGeneral => &mut flags.m_op_general,
     } = F::ONE;
 }
 
@@ -204,8 +194,8 @@ fn perform_op<F: Field>(
         Operation::Swap(n) => generate_swap(n, state, row)?,
         Operation::Iszero => generate_iszero(state, row)?,
         Operation::Not => generate_not(state, row)?,
-        Operation::Shl => generate_shl(state, row)?,
-        Operation::Shr => generate_shr(state, row)?,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shl) => generate_shl(state, row)?,
+        Operation::BinaryArithmetic(arithmetic::BinaryOperator::Shr) => generate_shr(state, row)?,
         Operation::Syscall(opcode, stack_values_read, stack_len_increased) => {
             generate_syscall(opcode, stack_values_read, stack_len_increased, state, row)?
         }
@@ -296,7 +286,7 @@ fn log_kernel_instruction<F: Field>(state: &GenerationState<F>, op: Operation) {
     let pc = state.registers.program_counter;
     let is_interesting_offset = KERNEL
         .offset_label(pc)
-        .filter(|label| !label.starts_with("halt_pc"))
+        .filter(|label| !label.starts_with("halt"))
         .is_some();
     let level = if is_interesting_offset {
         log::Level::Debug
diff --git a/evm/tests/basic_smart_contract.rs b/evm/tests/basic_smart_contract.rs
index 4d0a2090b6..2cd549ff9e 100644
--- a/evm/tests/basic_smart_contract.rs
+++ b/evm/tests/basic_smart_contract.rs
@@ -53,7 +53,10 @@ fn test_basic_smart_contract() -> anyhow::Result<()> {
     let code_gas = 3 + 3 + 3;
     let code_hash = keccak(code);
 
-    let beneficiary_account_before = AccountRlp::default();
+    let beneficiary_account_before = AccountRlp {
+        nonce: 1.into(),
+        ..AccountRlp::default()
+    };
     let sender_account_before = AccountRlp {
         nonce: 5.into(),
         balance: eth_to_wei(100_000.into()),
@@ -66,6 +69,11 @@ fn test_basic_smart_contract() -> anyhow::Result<()> {
 
     let state_trie_before = {
         let mut children = core::array::from_fn(|_| Node::Empty.into());
+        children[beneficiary_nibbles.get_nibble(0) as usize] = Node::Leaf {
+            nibbles: beneficiary_nibbles.truncate_n_nibbles_front(1),
+            value: rlp::encode(&beneficiary_account_before).to_vec(),
+        }
+        .into();
         children[sender_nibbles.get_nibble(0) as usize] = Node::Leaf {
             nibbles: sender_nibbles.truncate_n_nibbles_front(1),
             value: rlp::encode(&sender_account_before).to_vec(),
@@ -90,25 +98,33 @@ fn test_basic_smart_contract() -> anyhow::Result<()> {
         storage_tries: vec![],
     };
 
+    let txdata_gas = 2 * 16;
+    let gas_used = 21_000 + code_gas + txdata_gas;
+
     // Generated using a little py-evm script.
     let txn = hex!("f861050a8255f094a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0a0648242421ba02c89eb757d9deeb1f5b3859a9d4d679951ef610ac47ad4608dc142beb1b7e313a05af7e9fbab825455d36c36c7f4cfcafbeafa9a77bdff936b52afb36d4fe4bcdd");
     let value = U256::from(100u32);
 
     let block_metadata = BlockMetadata {
         block_beneficiary: Address::from(beneficiary),
-        ..BlockMetadata::default()
+        block_difficulty: 0x20000.into(),
+        block_number: 1.into(),
+        block_chain_id: 1.into(),
+        block_timestamp: 0x03e8.into(),
+        block_gaslimit: 0xff112233u32.into(),
+        block_gas_used: gas_used.into(),
+        block_bloom: [0.into(); 8],
+        block_base_fee: 0xa.into(),
     };
 
     let mut contract_code = HashMap::new();
     contract_code.insert(keccak(vec![]), vec![]);
     contract_code.insert(code_hash, code.to_vec());
 
-    let txdata_gas = 2 * 16;
-    let gas_used = 21_000 + code_gas + txdata_gas;
     let expected_state_trie_after: HashedPartialTrie = {
         let beneficiary_account_after = AccountRlp {
-            balance: beneficiary_account_before.balance + gas_used * 10,
-            ..beneficiary_account_before
+            nonce: 1.into(),
+            ..AccountRlp::default()
         };
         let sender_account_after = AccountRlp {
             balance: sender_account_before.balance - value - gas_used * 10,
diff --git a/evm/tests/self_balance_gas_cost.rs b/evm/tests/self_balance_gas_cost.rs
index d346164725..9ba1ac5497 100644
--- a/evm/tests/self_balance_gas_cost.rs
+++ b/evm/tests/self_balance_gas_cost.rs
@@ -5,7 +5,7 @@ use std::time::Duration;
 use env_logger::{try_init_from_env, Env, DEFAULT_FILTER_ENV};
 use eth_trie_utils::nibbles::Nibbles;
 use eth_trie_utils::partial_trie::{HashedPartialTrie, PartialTrie};
-use ethereum_types::{Address, H256};
+use ethereum_types::{Address, H256, U256};
 use hex_literal::hex;
 use keccak_hash::keccak;
 use plonky2::field::goldilocks_field::GoldilocksField;
@@ -62,7 +62,10 @@ fn self_balance_gas_cost() -> anyhow::Result<()> {
     + 22100; // SSTORE
     let code_hash = keccak(code);
 
-    let beneficiary_account_before = AccountRlp::default();
+    let beneficiary_account_before = AccountRlp {
+        nonce: 1.into(),
+        ..AccountRlp::default()
+    };
     let sender_account_before = AccountRlp {
         balance: 0x3635c9adc5dea00000u128.into(),
         ..AccountRlp::default()
@@ -89,10 +92,18 @@ fn self_balance_gas_cost() -> anyhow::Result<()> {
 
     let txn = hex!("f861800a8405f5e10094100000000000000000000000000000000000000080801ba07e09e26678ed4fac08a249ebe8ed680bf9051a5e14ad223e4b2b9d26e0208f37a05f6e3f188e3e6eab7d7d3b6568f5eac7d687b08d307d3154ccd8c87b4630509b");
 
+    let gas_used = 21_000 + code_gas;
+
     let block_metadata = BlockMetadata {
         block_beneficiary: Address::from(beneficiary),
+        block_difficulty: 0x20000.into(),
+        block_number: 1.into(),
+        block_chain_id: 1.into(),
+        block_timestamp: 0x03e8.into(),
+        block_gaslimit: 0xff112233u32.into(),
+        block_gas_used: gas_used.into(),
+        block_bloom: [0.into(); 8],
         block_base_fee: 0xa.into(),
-        ..BlockMetadata::default()
     };
 
     let mut contract_code = HashMap::new();
@@ -100,9 +111,12 @@ fn self_balance_gas_cost() -> anyhow::Result<()> {
     contract_code.insert(code_hash, code.to_vec());
 
     let expected_state_trie_after = {
-        let beneficiary_account_after = AccountRlp::default();
+        let beneficiary_account_after = AccountRlp {
+            nonce: 1.into(),
+            ..AccountRlp::default()
+        };
         let sender_account_after = AccountRlp {
-            balance: 999999999999999568680u128.into(),
+            balance: sender_account_before.balance - U256::from(gas_used) * U256::from(10),
             nonce: 1.into(),
             ..AccountRlp::default()
         };
@@ -132,7 +146,6 @@ fn self_balance_gas_cost() -> anyhow::Result<()> {
         expected_state_trie_after
     };
 
-    let gas_used = 21_000 + code_gas;
     let receipt_0 = LegacyReceiptRlp {
         status: true,
         cum_gas_used: gas_used.into(),